44 Commits

Author SHA1 Message Date
Wiebe van Breukelen
5f00c5d304 Add README.md (#38)
* Add README.md

* Improve README description of TrigDx library

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

---------

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>
2025-10-22 16:55:37 +02:00
Wiebe van Breukelen
f85e67e669 Fix compiler warnings (#37) 2025-10-22 16:48:26 +02:00
mmancini-skao
76998a137a Fix error in Taylor expansion (#36)
Replaced term3 with term4 in calculations for t4.
2025-10-20 17:09:35 +02:00
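For context on the fix above: the lookup backends refine each table entry with a short Taylor expansion around the sampled point x_0 (see the lookup_xsimd changes further down this page), and the fourth-order coefficient must be 1/4!, i.e. `t4` is built from `TERM4` rather than `TERM3`:

$$\cos(x_0+\Delta) \approx \cos x_0\,\bigl(1 - \tfrac{\Delta^2}{2!} + \tfrac{\Delta^4}{4!}\bigr) - \sin x_0\,\bigl(\Delta - \tfrac{\Delta^3}{3!}\bigr)$$
$$\sin(x_0+\Delta) \approx \sin x_0\,\bigl(1 - \tfrac{\Delta^2}{2!} + \tfrac{\Delta^4}{4!}\bigr) + \cos x_0\,\bigl(\Delta - \tfrac{\Delta^3}{3!}\bigr)$$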
Bram Veenboer
500d35070e Fix formatting (#35)
* Run pre-commit

* Skip line-length check in cmake-lint
2025-10-10 09:19:18 +02:00
Dantali0n
bfe752433f Fixes #30, Add CMake steps to install python bindings (#31) 2025-09-17 20:03:28 +02:00
Bram Veenboer
8fe8314905 Update GPU backend (#29)
* Update GPU memory management
* Add allocate_memory and free_memory
2025-09-03 09:16:28 +02:00
Wiebe van Breukelen
9d3af8c202 Fixed broken pybind11 target check (#28)
* Fixed broken pybind11 target check

* Update python/CMakeLists.txt

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>

---------

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>
2025-08-27 17:07:30 +02:00
Bram Veenboer
0f7fd06be8 Extend CI (#27)
* Add build with Intel compiler
* Switch to upload/download-artifacts that retain permissions
2025-08-21 15:15:25 +02:00
Bram Veenboer
112baf447b Update the test workflow (#26)
- Upgrade to v1.2 of slurm-action
- Add extra element to matrix to test this new action
- Add cleanup step
- Simplify handling of artefacts
- Execute tests directly (not with ctest)
2025-08-21 09:06:42 +02:00
Wiebe van Breukelen
a5ba99ff5f Only fetch pybind11 when target not available (#25) 2025-08-15 16:19:34 +02:00
Wiebe van Breukelen
77c55d6824 Change CMAKE_SOURCE_DIR to PROJECT_SOURCE_DIR (#24) 2025-08-15 15:30:37 +02:00
Bram Veenboer
c85df5f69c Add Taylor expansion to LookupXSIMD (#23)
Co-authored-by: Wiebe van Breukelen <breukelen@astron.nl>
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:53:15 +02:00
Bram Veenboer
9c17e90c77 Add AVX checks (#20)
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:30:57 +02:00
Bram Veenboer
e755c1a454 Fix CI (#21) 2025-08-15 10:29:48 +02:00
Bram Veenboer
b129d9ffaf Update CI (#19) 2025-08-15 10:12:55 +02:00
Bram Veenboer
428f60f6d6 Delete .vscode/settings.json (#18) 2025-08-14 17:16:02 +02:00
Bram Veenboer
0e8ea57025 Add initial CI (#17) 2025-08-14 17:14:29 +02:00
Bram Veenboer
a072ffd12f Merge pull request #10 from astron-rd/add-python-interface
Add Python interface
2025-08-14 15:21:13 +02:00
Bram Veenboer
b3467840f9 Use const size_t for n 2025-08-14 11:07:45 +02:00
Bram Veenboer
79dc7b4285 Add dimension checks 2025-08-14 11:05:31 +02:00
Bram Veenboer
97692cface Use Type template for helper functions 2025-08-14 11:02:30 +02:00
Bram Veenboer
f40c44d5dd Add Python interface 2025-08-14 10:57:06 +02:00
Wiebe van Breukelen
0cbefb77b7 Fix CMake interface include paths (#14)
* Fixed include path

* Formatting

* Remove lookup_constexpr.hpp
2025-08-14 09:34:27 +02:00
Bram Veenboer
cdc94ab2cb Merge pull request #11 from astron-rd/fix-cuda-test
Bugfixes in data copies in CUDA backend
2025-08-12 17:16:10 +02:00
Bram Veenboer
33f98abc48 Set tolerance for CUDA tests to 1e-6 2025-08-12 17:14:41 +02:00
Bram Veenboer
c7ab463b43 Bugfix gpu output data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
ebb6d50c0b Bugfix gpu input data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
5338f3e135 Cleanup data initialization of tests 2025-08-12 17:14:41 +02:00
Bram Veenboer
f6575599fd Cleanup data initialization of benchmarks 2025-08-12 17:14:41 +02:00
Bram Veenboer
cd048e5581 Remove unused DEFAULT_N 2025-08-12 17:14:41 +02:00
Bram Veenboer
eb3806442d Merge pull request #9 from astron-rd/cmake-bugfix
Fix issue when building with TRIGDX_USE_GPU=1
2025-08-12 15:28:47 +02:00
Bram Veenboer
e58d6dae8d Merge pull request #7 from astron-rd/use-google-benchmark
Refactor benchmarks using Google Benchmark
2025-08-12 15:24:58 +02:00
Bram Veenboer
a65137322d Use static_cast 2025-08-12 15:18:07 +02:00
Bram Veenboer
b936b3998e Fix issue when building with TRIGDX_USE_GPU=1 2025-08-12 14:59:36 +02:00
Bram Veenboer
0e2d9862d5 Merge pull request #8 from astron-rd/use-common-headers
Add trigdx_config.hpp and trigdx.hpp header files
2025-08-12 14:50:19 +02:00
Bram Veenboer
2f39c5c86e Add trigdx_config.hpp and trigdx.hpp header files 2025-08-12 14:04:13 +02:00
Bram Veenboer
5e7aca89bb Refactor benchmarks using Google Benchmark 2025-08-12 13:17:23 +02:00
Bram Veenboer
e9a74ef283 Merge pull request #6 from astron-rd/fix-tests-cmake
Improve the CMake for the tests
2025-08-12 11:38:25 +02:00
Bram Veenboer
832da6229d Improve the CMake 2025-08-12 11:28:43 +02:00
Wiebe van Breukelen
b71611ed17 Merge pull request #5 from astron-rd/auto-pull-xsimd
Pull xsimd when not available
2025-08-12 10:29:35 +02:00
Wiebe van Breukelen
5a4e80ea4a Pull xsimd when not available 2025-08-12 10:18:32 +02:00
Wiebe van Breukelen
13f18847bc Merge pull request #3 from astron-rd/add-gitignore
Add .gitignore
2025-08-12 10:11:04 +02:00
Wiebe van Breukelen
eee382307f Merge pull request #2 from astron-rd/add-cmake-option-prefix
Add TRIGDX prefix to CMake configurable options
2025-08-12 10:10:55 +02:00
Wiebe van Breukelen
d04fb8933d Add TRIGDX prefix to CMake configurable options 2025-08-12 09:52:28 +02:00
30 changed files with 862 additions and 305 deletions

.github/workflows/linting.yml

@@ -0,0 +1,15 @@
name: linting
on:
push:
jobs:
format:
name: Linting
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- run: sudo apt-get update
- run: sudo apt-get install -y clang-format-14 cmake-format pre-commit
- run: pre-commit run -a

.github/workflows/test.yml

@@ -0,0 +1,89 @@
name: test
on:
push:
jobs:
build:
runs-on: [slurm]
strategy:
matrix: &matrix
include:
- name: GNU, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-mkl cuda python"
- name: Intel, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-compilers intel-oneapi-mkl cuda python"
- name: NVIDIA GH200
partition: ghq
gres: gpu:GH200
cmake_flags: "-DTRIGDX_USE_GPU=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=90a -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
CMAKE_FLAGS: ${{ matrix.cmake_flags }}
ENVIRONMENT_MODULES: ${{ matrix.environment_modules }}
steps:
- &cleanup
name: "Remove old workspace"
run: |
rm -rf $GITHUB_WORKSPACE/*
mkdir -p $GITHUB_WORKSPACE
- uses: actions/checkout@v4
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
module load ${ENVIRONMENT_MODULES}
cmake -S . -B build ${CMAKE_FLAGS}
make -C build -j
- name: Upload build
uses: pyTooling/upload-artifact@v4
with:
name: build-${{ matrix.name }}
path: build
test:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/tests -type f -executable -exec {} \;
benchmark:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/benchmarks -type f -executable -exec {} \;


@@ -7,4 +7,5 @@ repos:
rev: v0.6.13
hooks:
- id: cmake-format
- id: cmake-lint
- id: cmake-lint
args: [--disabled-codes=C0301]

.vscode/settings.json

@@ -1,77 +0,0 @@
{
"files.associations": {
"*.cc": "cpp",
"*.tpp": "cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"csignal": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"regex": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"format": "cpp",
"fstream": "cpp",
"future": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"ranges": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"typeinfo": "cpp",
"variant": "cpp",
"queue": "cpp",
"stack": "cpp"
}
}


@@ -3,13 +3,45 @@ project(trigdx LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(USE_MKL "Enable Intel MKL backend" OFF)
option(USE_GPU "Enable GPU backend" OFF)
option(USE_XSIMD "Enable XSIMD backend" OFF)
option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
option(TRIGDX_BUILD_TESTS "Build tests" ON)
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
# Add compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
${CMAKE_CURRENT_BINARY_DIR}/include/trigdx/trigdx_config.hpp @ONLY)
if(TRIGDX_BUILD_TESTS
OR TRIGDX_BUILD_BENCHMARKS
OR TRIGDX_BUILD_PYTHON)
include(FetchContent)
endif()
include_directories(${PROJECT_SOURCE_DIR}/include)
add_subdirectory(src)
add_subdirectory(tests)
add_subdirectory(benchmarks)
if(TRIGDX_BUILD_TESTS)
include(CTest)
enable_testing()
add_subdirectory(tests)
endif()
if(TRIGDX_BUILD_BENCHMARKS)
add_subdirectory(benchmarks)
endif()
if(TRIGDX_BUILD_PYTHON)
add_subdirectory(python)
endif()

README.md

@@ -0,0 +1,54 @@
# TrigDx
High-performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signal-processing, and real-time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
## Why TrigDx?
Many applications use the standard library implementations of these functions, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
- Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable, as sketched below.
- Use SIMD/vectorized implementations and compact lookup tables for high-throughput evaluation.
- Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
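A minimal usage sketch (illustrative, not taken from the library's documentation; it assumes the `Backend` interface visible in the headers under `include/trigdx/`: `init`, `compute_sincosf`, and the `LookupBackend<N>` lookup-table backend):
```cpp
#include <cstddef>
#include <vector>

#include <trigdx/trigdx.hpp>  // umbrella header; pulls in ReferenceBackend, LookupBackend<N>, ...

int main() {
  const std::size_t n = 1024;
  std::vector<float> x(n), s(n), c(n);
  for (std::size_t i = 0; i < n; ++i) {
    x[i] = (i % 360) * 0.0174533f;  // degrees to radians, as in the benchmarks
  }

  LookupBackend<16384> backend;  // 16384-entry sin/cos lookup table
  backend.init(n);               // optional one-time setup
  backend.compute_sincosf(n, x.data(), s.data(), c.data());
  return 0;
}
```
Any of the other backends (`ReferenceBackend`, `LookupAVXBackend<N>`, `LookupXSIMDBackend<N>`, `MKLBackend`, `GPUBackend`) can be swapped in behind the same calls.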
## Requirements
- A C++ compiler with at least C++17 support (GCC, Clang)
- CMake 3.15+
- Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
- Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
## Building
```bash
git clone https://github.com/astron-rd/TrigDx.git
cd TrigDx
mkdir build && cd build
# CPU-only:
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
cmake --build . -j
# Enable CUDA (if available):
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
cmake --build . -j
# Run tests:
ctest --output-on-failure -j
```
Common CMake options:
- `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
- `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
- `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
- `TRIGDX_BUILD_PYTHON=ON/OFF` — build the Python interface.
## Contributing
- Fork → create a feature branch → open a PR.
- Include unit tests for correctness-sensitive changes and benchmark results for performance changes.
- Follow project style (clang-format) and run tests locally before submitting.
## Reporting issues
When opening an issue for incorrect results or performance regressions, please include:
- Platform and CPU/GPU model.
- Compiler and version with exact compile flags.
- Small reproducer (input data and the TrigDx implementation used).
## License
See the LICENSE file in the repository for licensing details.


@@ -1,23 +1,36 @@
FetchContent_Declare(
benchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.4)
set(BENCHMARK_ENABLE_TESTING
OFF
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(benchmark)
add_executable(benchmark_reference benchmark_reference.cpp)
target_link_libraries(benchmark_reference PRIVATE trigdx)
target_link_libraries(benchmark_reference PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup benchmark_lookup.cpp)
target_link_libraries(benchmark_lookup PRIVATE trigdx)
target_link_libraries(benchmark_lookup PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx)
if(HAVE_AVX)
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx
benchmark::benchmark)
endif()
if(USE_MKL)
if(TRIGDX_USE_MKL)
add_executable(benchmark_mkl benchmark_mkl.cpp)
target_link_libraries(benchmark_mkl PRIVATE trigdx)
target_link_libraries(benchmark_mkl PRIVATE trigdx benchmark::benchmark)
endif()
if(USE_GPU)
if(TRIGDX_USE_GPU)
add_executable(benchmark_gpu benchmark_gpu.cpp)
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu benchmark::benchmark)
endif()
if(USE_XSIMD)
if(TRIGDX_USE_XSIMD)
add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx
benchmark::benchmark)
endif()


@@ -1,9 +1,21 @@
#include <trigdx/gpu.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<GPUBackend>();
benchmark_cosf<GPUBackend>();
benchmark_sincosf<GPUBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,13 +1,30 @@
#include <trigdx/lookup.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupBackend<16384>>();
benchmark_cosf<LookupBackend<16384>>();
benchmark_sincosf<LookupBackend<16384>>();
benchmark_sinf<LookupBackend<32768>>();
benchmark_cosf<LookupBackend<32768>>();
benchmark_sincosf<LookupBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupBackend<16384>>();
register_benchmarks<LookupBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,13 +1,30 @@
#include <trigdx/lookup_avx.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupAVXBackend<16384>>();
benchmark_cosf<LookupAVXBackend<16384>>();
benchmark_sincosf<LookupAVXBackend<16384>>();
benchmark_sinf<LookupAVXBackend<32768>>();
benchmark_cosf<LookupAVXBackend<32768>>();
benchmark_sincosf<LookupAVXBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupAVXBackend<16384>>();
register_benchmarks<LookupAVXBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,13 +1,30 @@
#include <trigdx/lookup_xsimd.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupXSIMDBackend<16384>>();
benchmark_cosf<LookupXSIMDBackend<16384>>();
benchmark_sincosf<LookupXSIMDBackend<16384>>();
benchmark_sinf<LookupXSIMDBackend<32768>>();
benchmark_cosf<LookupXSIMDBackend<32768>>();
benchmark_sincosf<LookupXSIMDBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupXSIMDBackend<16384>>();
register_benchmarks<LookupXSIMDBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,9 +1,21 @@
#include <trigdx/mkl.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<MKLBackend>();
benchmark_cosf<MKLBackend>();
benchmark_sincosf<MKLBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,9 +1,21 @@
#include <trigdx/reference.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<ReferenceBackend>();
benchmark_cosf<ReferenceBackend>();
benchmark_sincosf<ReferenceBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,76 +1,122 @@
#pragma once
#include <chrono>
#include <iomanip>
#include <iostream>
#include <cmath>
#include <stdexcept>
#include <string>
#include <vector>
const size_t N = 1e7;
#include <benchmark/benchmark.h>
inline void report(const std::string &name, double sec, double throughput) {
std::ios state(nullptr);
state.copyfmt(std::cout);
std::cout << std::setw(7) << name << " -> ";
std::cout << "time: ";
std::cout << std::fixed << std::setprecision(3) << std::setfill('0');
std::cout << sec << " s, ";
std::cout << "throughput: " << throughput << " M elems/sec\n";
std::cout.copyfmt(state);
void init_x(float *x, size_t n) {
for (size_t i = 0; i < n; ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians
}
}
template <typename Backend> inline void benchmark_sinf() {
std::vector<float> x(N), s(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_sinf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_sinf(N, x.data(), s.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("sinf", sec, throughput);
for (auto _ : state) {
backend.compute_sinf(N, x, s);
benchmark::DoNotOptimize(s);
}
backend.free_memory(x);
backend.free_memory(s);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}
template <typename Backend> inline void benchmark_cosf() {
std::vector<float> x(N), c(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_cosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_cosf(N, x.data(), c.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("cosf", sec, throughput);
for (auto _ : state) {
backend.compute_cosf(N, x, c);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}
template <typename Backend> inline void benchmark_sincosf() {
std::vector<float> x(N), s(N), c(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_sincosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_sincosf(N, x.data(), s.data(), c.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !s || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("sincosf", sec, throughput);
for (auto _ : state) {
backend.compute_sincosf(N, x, s, c);
benchmark::DoNotOptimize(s);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(s);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}

cmake/FindAVX.cmake

@@ -0,0 +1,42 @@
include(CheckCXXSourceRuns)
set(SUPPORTED_COMPILERS Clang;GNU;Intel;IntelLLVM)
if(CMAKE_CXX_COMPILER_ID IN_LIST SUPPORTED_COMPILERS)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_REQUIRED_FLAGS "-xHost") # ICC
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
set(CMAKE_REQUIRED_FLAGS "-march=native") # ICX
else()
set(CMAKE_REQUIRED_FLAGS "-march=native") # GCC/Clang
endif()
else()
message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}.")
endif()
# AVX check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256 a = _mm256_setzero_ps(); // AVX
(void) a;
return 0;
}
"
HAVE_AVX)
if(HAVE_AVX)
# AVX2 check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256i a = _mm256_set1_epi32(-1);
__m256i b = _mm256_abs_epi32(a); // AVX2
(void) b;
return 0;
}
"
HAVE_AVX2)
endif()


@@ -0,0 +1,5 @@
#pragma once
#cmakedefine TRIGDX_USE_MKL
#cmakedefine TRIGDX_USE_GPU
#cmakedefine TRIGDX_USE_XSIMD


@@ -11,7 +11,8 @@ public:
GPUBackend();
~GPUBackend() override;
void init(size_t n = 0) override;
void *allocate_memory(size_t bytes) const override;
void free_memory(void *ptr) const override;
void compute_sinf(size_t n, const float *x, float *s) const override;
void compute_cosf(size_t n, const float *x, float *c) const override;
void compute_sincosf(size_t n, const float *x, float *s,


@@ -1,6 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <cstdlib>
// Base interface for all math backends
class Backend {
@@ -10,6 +12,12 @@ public:
// Optional initialization
virtual void init(size_t n = 0) {}
virtual void *allocate_memory(size_t bytes) const {
return static_cast<void *>(new uint8_t[bytes]);
};
virtual void free_memory(void *ptr) const { std::free(ptr); };
// Compute sine for n elements
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;

include/trigdx/trigdx.hpp

@@ -0,0 +1,19 @@
#pragma once
#include <trigdx/trigdx_config.hpp>
#include <trigdx/lookup.hpp>
#include <trigdx/lookup_avx.hpp>
#include <trigdx/reference.hpp>
#if defined(TRIGDX_USE_MKL)
#include <trigdx/mkl.hpp>
#endif
#if defined(TRIGDX_USE_GPU)
#include <trigdx/gpu.hpp>
#endif
#if defined(TRIGDX_USE_XSIMD)
#include <trigdx/lookup_xsimd.hpp>
#endif

python/CMakeLists.txt

@@ -0,0 +1,23 @@
find_package(pybind11 CONFIG QUIET)
if(NOT pybind11_FOUND)
FetchContent_Declare(
pybind11
GIT_REPOSITORY https://github.com/pybind/pybind11.git
GIT_TAG v3.0.0)
FetchContent_MakeAvailable(pybind11)
endif()
# Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
find_package(Python REQUIRED)
pybind11_add_module(pytrigdx bindings.cpp)
target_link_libraries(pytrigdx PRIVATE trigdx)
set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
set(PYTHON_SITE_PACKAGES
"${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
)
install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})

python/__init__.py

@@ -0,0 +1,16 @@
from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
try:
from .trigdx import MKL
except ImportError:
pass
try:
from .trigdx import GPU
except ImportError:
pass
try:
from .trigdx import LookupXSIMD16K, LookupXSIMD32K
except ImportError:
pass

python/bindings.cpp

@@ -0,0 +1,96 @@
#include <memory>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <trigdx/trigdx.hpp>
namespace py = pybind11;
template <typename T>
py::array_t<T>
compute_sin(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<float> s(n);
T *s_ptr = s.mutable_data();
backend.compute_sinf(n, x_ptr, s_ptr);
return s;
}
template <typename T>
py::array_t<T>
compute_cos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> c(n);
T *c_ptr = c.mutable_data();
backend.compute_cosf(n, x_ptr, c_ptr);
return c;
}
template <typename T>
std::tuple<py::array_t<T>, py::array_t<T>>
compute_sincos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> s(n);
py::array_t<T> c(n);
backend.compute_sincosf(n, x_ptr, s.mutable_data(), c.mutable_data());
return std::make_tuple(s, c);
}
template <typename BackendType>
void bind_backend(py::module &m, const char *name) {
py::class_<BackendType, Backend, std::shared_ptr<BackendType>>(m, name)
.def(py::init<>())
.def("compute_sinf", &compute_sin<float>)
.def("compute_cosf", &compute_cos<float>)
.def("compute_sincosf", &compute_sincos<float>);
}
PYBIND11_MODULE(trigdx, m) {
m.doc() = "TrigDx python bindings";
py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
.def("init", &Backend::init);
bind_backend<ReferenceBackend>(m, "Reference");
bind_backend<LookupBackend<16384>>(m, "Lookup16K");
bind_backend<LookupBackend<32768>>(m, "Lookup32K");
bind_backend<LookupAVXBackend<16384>>(m, "LookupAVX16K");
bind_backend<LookupAVXBackend<32768>>(m, "LookupAVX32K");
#if defined(TRIGDX_USE_MKL)
bind_backend<MKLBackend>(m, "MKL");
#endif
#if defined(TRIGDX_USE_GPU)
bind_backend<GPUBackend>(m, "GPU");
#endif
#if defined(TRIGDX_USE_XSIMD)
bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
#endif
}


@@ -1,16 +1,20 @@
add_library(trigdx reference.cpp lookup.cpp lookup_avx.cpp)
include(FetchContent)
include(FindAVX)
add_library(trigdx reference.cpp lookup.cpp)
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
target_compile_options(trigdx PRIVATE -O3 -march=native)
if(HAVE_AVX)
target_sources(trigdx PRIVATE lookup_avx.cpp)
endif()
if(USE_MKL)
if(TRIGDX_USE_MKL)
find_package(MKL REQUIRED)
target_sources(trigdx PRIVATE mkl.cpp)
target_link_libraries(trigdx PRIVATE MKL::MKL)
endif()
if(USE_GPU)
if(TRIGDX_USE_GPU)
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
add_library(gpu SHARED gpu/gpu.cu)
@@ -19,8 +23,20 @@ if(USE_GPU)
target_link_libraries(trigdx PRIVATE gpu)
endif()
if(USE_XSIMD)
find_package(xsimd REQUIRED)
if(TRIGDX_USE_XSIMD)
# Requires XSIMD > 13 for architecture independent dispatching
find_package(xsimd 13 QUIET)
if(NOT TARGET xsimd)
FetchContent_Declare(
xsimd
GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
GIT_TAG 13.2.0)
FetchContent_MakeAvailable(xsimd)
endif()
target_sources(trigdx PRIVATE lookup_xsimd.cpp)
target_link_libraries(trigdx PRIVATE xsimd)
endif()
target_include_directories(
trigdx INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>)


@@ -10,75 +10,63 @@
struct GPUBackend::Impl {
~Impl() {
if (h_x) {
cudaFreeHost(h_x);
}
if (h_s) {
cudaFreeHost(h_s);
}
if (h_c) {
cudaFreeHost(h_c);
}
if (d_x) {
cudaFree(d_x);
}
if (d_s) {
cudaFree(d_s);
}
if (d_c) {
cudaFree(d_c);
}
void *allocate_memory(size_t bytes) const {
void *ptr;
cudaMallocHost(&ptr, bytes);
return ptr;
}
void init(size_t n) {
const size_t bytes = n * sizeof(float);
cudaMallocHost(&h_x, bytes);
cudaMallocHost(&h_s, bytes);
cudaMallocHost(&h_c, bytes);
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
}
void free_memory(void *ptr) const { cudaFreeHost(ptr); }
void compute_sinf(size_t n, const float *x, float *s) const {
const size_t bytes = n * sizeof(float);
std::memcpy(h_x, x, bytes);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
std::memcpy(s, h_s, bytes);
float *d_x, *d_s;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sinf_kernel(d_x, d_s, n);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
}
void compute_cosf(size_t n, const float *x, float *c) const {
const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
std::memcpy(c, h_c, bytes);
float *d_x, *d_c;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_cosf_kernel(d_x, d_c, n);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_c);
}
void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
float *d_x, *d_s, *d_c;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
cudaFree(d_c);
}
float *h_x = nullptr;
float *h_s = nullptr;
float *h_c = nullptr;
float *d_x = nullptr;
float *d_s = nullptr;
float *d_c = nullptr;
};
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
GPUBackend::~GPUBackend() = default;
void GPUBackend::init(size_t n) { impl->init(n); }
void *GPUBackend::allocate_memory(size_t bytes) const {
return impl->allocate_memory(bytes);
}
void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
impl->compute_sinf(n, x, s);


@@ -160,7 +160,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
for (std::size_t i = 0; i < n; ++i) {
std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
s[i] = lookup[idx];
c[i] = lookup[idx_cos];
}
#endif


@@ -8,12 +8,20 @@
template <std::size_t NR_SAMPLES> struct lookup_table {
static constexpr std::size_t MASK = NR_SAMPLES - 1;
static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
lookup_table() : values{} {
static constexpr float PI_FRAC = 2.0f * M_PIf32 / NR_SAMPLES;
static constexpr float TERM1 = 1.0f; // 1
static constexpr float TERM2 = 0.5f; // 1/2!
static constexpr float TERM3 = 1.0f / 6.0f; // 1/3!
static constexpr float TERM4 = 1.0f / 24.0f; // 1/4!
constexpr lookup_table() : sin_values{}, cos_values{} {
for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES));
sin_values[i] = sinf(i * PI_FRAC);
cos_values[i] = cosf(i * PI_FRAC);
}
}
std::array<float, NR_SAMPLES> values;
std::array<float, NR_SAMPLES> sin_values;
std::array<float, NR_SAMPLES> cos_values;
};
template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -25,28 +33,56 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi);
idx_cos = xsimd::bitwise_and(idx_cos, mask);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
const b_type f_idx = xsimd::to_float(idx);
idx = xsimd::bitwise_and(idx, mask);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
cosv.store(c + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
c[i] = lookup_table_.values[idx_cos];
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
c[i] = cosv * cosdx - sinv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;
@@ -61,25 +97,53 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
idx = xsimd::bitwise_and(idx, mask);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
b_type f_idx = xsimd::to_float(idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
idx = xsimd::bitwise_and(idx, mask);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
sinv.store(s + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
s[i] = lookup_table_.values[idx];
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;
@@ -94,31 +158,56 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi);
b_type f_idx = xsimd::to_float(idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
idx = xsimd::bitwise_and(idx, mask);
idx_cos = xsimd::bitwise_and(idx_cos, mask);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
sinv.store(s + i, Tag());
cosv.store(c + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
s[i] = lookup_table_.values[idx];
c[i] = lookup_table_.values[idx_cos];
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
c[i] = cosv * cosdx - sinv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;


@@ -1,5 +1,3 @@
include(FetchContent)
FetchContent_Declare(
catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
@@ -9,31 +7,31 @@ FetchContent_MakeAvailable(catch2)
# Lookup backend test
add_executable(test_lookup test_lookup.cpp)
target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
# LookupAVX backend test
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
# MKL backend test
if(USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
endif()
include(CTest)
add_test(NAME test_lookup COMMAND test_lookup)
if(USE_MKL)
# LookupAVX backend test
if(HAVE_AVX)
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_avx COMMAND test_lookup_avx)
endif()
# MKL backend test
if(TRIGDX_USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_mkl COMMAND test_mkl)
endif()
if(USE_GPU)
# GPU backend test
if(TRIGDX_USE_GPU)
add_executable(test_gpu test_gpu.cpp)
target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_gpu COMMAND test_gpu)
endif()
if(USE_XSIMD)
# XSIMD backend test
if(TRIGDX_USE_XSIMD)
add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd)


@@ -3,8 +3,8 @@
#include "test_utils.hpp"
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); }
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-6f); }
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); }
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-6f); }
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); }
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-6f); }


@@ -4,16 +4,16 @@
#include "test_utils.hpp"
TEST_CASE("sincosf") {
test_sincosf<LookupXSIMDBackend<16384>>(1e-2f);
test_sincosf<LookupXSIMDBackend<32768>>(1e-2f);
test_sincosf<LookupXSIMDBackend<16384>>(1e-6f);
test_sincosf<LookupXSIMDBackend<32768>>(1e-6f);
}
TEST_CASE("sinf") {
test_sinf<LookupXSIMDBackend<16384>>(1e-2f);
test_sinf<LookupXSIMDBackend<32768>>(1e-2f);
test_sinf<LookupXSIMDBackend<16384>>(1e-6f);
test_sinf<LookupXSIMDBackend<32768>>(1e-6f);
}
TEST_CASE("cosf") {
test_cosf<LookupXSIMDBackend<16384>>(1e-2f);
test_cosf<LookupXSIMDBackend<32768>>(1e-2f);
test_cosf<LookupXSIMDBackend<16384>>(1e-6f);
test_cosf<LookupXSIMDBackend<32768>>(1e-6f);
}


@@ -7,14 +7,17 @@
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <trigdx/reference.hpp>
const size_t N = 1e7;
const size_t N = 1234;
void init_x(std::vector<float> &x) {
for (size_t i = 0; i < x.size(); ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians
}
}
template <typename Backend> inline void test_sinf(float tol) {
std::vector<float> x(N), s_ref(N), s(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;
@@ -30,10 +33,7 @@ template <typename Backend> inline void test_sinf(float tol) {
template <typename Backend> inline void test_cosf(float tol) {
std::vector<float> x(N), c_ref(N), c(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;
@@ -49,10 +49,7 @@ template <typename Backend> inline void test_cosf(float tol) {
template <typename Backend> inline void test_sincosf(float tol) {
std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;