8 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
807b9d5284 Fix dx4 calculation in scalar remainder code and add null checks
- Fix dx4 computation in scalar remainder loops (should be dx2*dx2)
- Add missing null pointer check in benchmark_sinf for consistency

Co-authored-by: wvbbreu <185333235+wvbbreu@users.noreply.github.com>
2025-10-29 16:26:27 +00:00
copilot-swe-agent[bot]
3addf2b05e Fix memory management bug and computational inefficiencies
- Fix mismatched new[]/free() to use proper delete[] in interface.hpp
- Fix incorrect dx4 calculation (should be dx2*dx2, not dx2*dx) in lookup_xsimd.cpp
- Fix redundant sinv calculation by using separate base variables in sin_cosf_dispatcher
- Remove unused variable declarations in lookup_avx.cpp
- Optimize reference backend to use sincosf() instead of separate sin/cos calls

Co-authored-by: wvbbreu <185333235+wvbbreu@users.noreply.github.com>
2025-10-29 16:23:35 +00:00
copilot-swe-agent[bot]
3eb537d586 Initial plan 2025-10-29 16:15:26 +00:00
Dantali0n
58bc640d6d 32: Set AVX and AVX2 flags using CMake checks (#34)
* 32: Set mavx and mavx2 based on CMake checks
* 32: Update flags for Intel compiler
* Fix: AVX2 instead of AVX__2

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>
Co-authored-by: lukken <lukken@astron.nl>
2025-10-29 09:18:43 +01:00
Wiebe van Breukelen
5f00c5d304 Add README.md (#38)
* Add README.md

* Improve README description of TrigDx library

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

---------

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>
2025-10-22 16:55:37 +02:00
Wiebe van Breukelen
f85e67e669 Fix compiler warnings (#37) 2025-10-22 16:48:26 +02:00
mmancini-skao
76998a137a Fix error in Taylor expansion (#36)
Replaced term3 with term4 in calculations for t4.
2025-10-20 17:09:35 +02:00
Bram Veenboer
500d35070e Fix formatting (#35)
* Run pre-commit

* Skip line-length check in cmake-lint
2025-10-10 09:19:18 +02:00
8 changed files with 109 additions and 25 deletions

View File

@@ -12,6 +12,11 @@ option(TRIGDX_BUILD_TESTS "Build tests" ON)
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON) option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON) option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
# Add compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
configure_file( configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in

54
README.md Normal file
View File

@@ -0,0 +1,54 @@
# TrigDx
Highperformance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signalprocessing, and realtime systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
## Why TrigDx?
Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
- Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
- Use SIMD/vectorized implementations and compact lookup tables for high throughput lookups.
- Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
## Requirements
- A C++ compiler with at least C++17 support (GCC, Clang)
- CMake 3.15+
- Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
- Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
## Building
```bash
git clone https://github.com/astron-rd/TrigDx.git
cd TrigDx
mkdir build && cd build
# CPU-only:
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
cmake --build . -j
# Enable CUDA (if available):
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
cmake --build . -j
# Run tests:
ctest --output-on-failure -j
```
Common CMake options:
- `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
- `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
- `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
- `TRIGDX_BUILD_PYTHON` — build Python interface.
## Contributing
- Fork → create a feature branch → open a PR.
- Include unit tests for correctnesssensitive changes and benchmark results for performance changes.
- Follow project style (clangformat) and run tests locally before submitting.
## Reporting issues
When opening an issue for incorrect results or performance regressions, please include:
- Platform and CPU/GPU model.
- Compiler and version with exact compile flags.
- Small reproducer (input data and the TrigDx implementation used).
## License
See the LICENSE file in the repository for licensing details.

View File

@@ -26,6 +26,9 @@ static void benchmark_sinf(benchmark::State &state) {
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float))); reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s = float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float))); reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !s) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)

View File

@@ -16,7 +16,7 @@ public:
return static_cast<void *>(new uint8_t[bytes]); return static_cast<void *>(new uint8_t[bytes]);
}; };
virtual void free_memory(void *ptr) const { std::free(ptr); }; virtual void free_memory(void *ptr) const { delete[] static_cast<uint8_t*>(ptr); };
// Compute sine for n elements // Compute sine for n elements
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;

View File

@@ -2,6 +2,24 @@ include(FetchContent)
include(FindAVX) include(FindAVX)
add_library(trigdx reference.cpp lookup.cpp) add_library(trigdx reference.cpp lookup.cpp)
if(HAVE_AVX2)
target_compile_definitions(trigdx PUBLIC HAVE_AVX2)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
"IntelLLVM")
target_compile_options(trigdx PUBLIC -xCORE-AVX2)
else()
target_compile_options(trigdx PUBLIC -mavx2)
endif()
elseif(HAVE_AVX)
target_compile_definitions(trigdx PUBLIC HAVE_AVX)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
"IntelLLVM")
target_compile_options(trigdx PUBLIC -xAVX)
else()
target_compile_options(trigdx PUBLIC -mavx)
endif()
endif()
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include) target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
if(HAVE_AVX) if(HAVE_AVX)

View File

@@ -6,6 +6,16 @@
#include "trigdx/lookup_avx.hpp" #include "trigdx/lookup_avx.hpp"
#if defined(HAVE_AVX) && !defined(__AVX__)
static_assert(HAVE_AVX == 0, "__AVX__ should be defined when HAVE_AVX is "
"defined");
#endif
#if defined(HAVE_AVX2) && !defined(__AVX2__)
static_assert(HAVE_AVX2 == 0, "__AVX2__ should be defined when HAVE_AVX2 is "
"defined");
#endif
template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl { template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
std::vector<float> lookup; std::vector<float> lookup;
static constexpr std::size_t MASK = NR_SAMPLES - 1; static constexpr std::size_t MASK = NR_SAMPLES - 1;
@@ -79,7 +89,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
constexpr std::size_t VL = 8; // AVX processes 8 floats constexpr std::size_t VL = 8; // AVX processes 8 floats
const __m256 scale = _mm256_set1_ps(SCALE); const __m256 scale = _mm256_set1_ps(SCALE);
const __m256i mask = _mm256_set1_epi32(MASK); const __m256i mask = _mm256_set1_epi32(MASK);
const __m256i quarter_pi = _mm256_set1_epi32(NR_SAMPLES / 4);
std::size_t i = 0; std::size_t i = 0;
for (; i + VL <= n; i += VL) { for (; i + VL <= n; i += VL) {
@@ -94,7 +103,7 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
#else #else
// fallback gather for AVX1 // fallback gather for AVX1
float sin_tmp[VL]; float sin_tmp[VL];
int idx_a[VL], idxc_a[VL]; int idx_a[VL];
_mm256_store_si256((__m256i *)idx_a, idx); _mm256_store_si256((__m256i *)idx_a, idx);
for (std::size_t k = 0; k < VL; ++k) { for (std::size_t k = 0; k < VL; ++k) {
sin_tmp[k] = lookup[idx_a[k]]; sin_tmp[k] = lookup[idx_a[k]];

View File

@@ -20,8 +20,8 @@ template <std::size_t NR_SAMPLES> struct lookup_table {
cos_values[i] = cosf(i * PI_FRAC); cos_values[i] = cosf(i * PI_FRAC);
} }
} }
std::array<float, NR_SAMPLES> cos_values;
std::array<float, NR_SAMPLES> sin_values; std::array<float, NR_SAMPLES> sin_values;
std::array<float, NR_SAMPLES> cos_values;
}; };
template <std::size_t NR_SAMPLES> struct cosf_dispatcher { template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -33,7 +33,6 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -42,7 +41,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2! const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -57,10 +56,10 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac)); const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx); const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx); const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx2);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term3); const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4); const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
@@ -79,7 +78,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
const float dx = a[i] - idx * lookup_table_.PI_FRAC; const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx; const float dx2 = dx * dx;
const float dx3 = dx2 * dx; const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx; const float dx4 = dx2 * dx2;
const float cosdx = const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4; 1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3; const float sindx = dx - lookup_table_.TERM3 * dx3;
@@ -98,7 +97,6 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -107,7 +105,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2! const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -117,10 +115,10 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac)); const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx); const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx); const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx2);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term3); const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4); const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3); const b_type sindx = xsimd::sub(dx, t3);
@@ -140,7 +138,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
const float dx = a[i] - idx * lookup_table_.PI_FRAC; const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx; const float dx2 = dx * dx;
const float dx3 = dx2 * dx; const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx; const float dx4 = dx2 * dx2;
const float cosdx = const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4; 1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3; const float sindx = dx - lookup_table_.TERM3 * dx3;
@@ -160,7 +158,6 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
@@ -170,7 +167,6 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -180,20 +176,20 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac)); const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx); const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx); const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx2);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term3); const b_type t4 = xsimd::mul(dx4, term4);
idx = xsimd::bitwise_and(idx, mask); idx = xsimd::bitwise_and(idx, mask);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx); const b_type sinv_base = b_type::gather(lookup_table_.sin_values.data(), idx);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx); const b_type cosv_base = b_type::gather(lookup_table_.cos_values.data(), idx);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4); const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3); const b_type sindx = xsimd::sub(dx, t3);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx)); b_type sinv = xsimd::add(xsimd::mul(cosv_base, sindx), xsimd::mul(sinv_base, cosdx));
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx)); b_type cosv = xsimd::sub(xsimd::mul(cosv_base, cosdx), xsimd::mul(sinv_base, sindx));
sinv.store(s + i, Tag()); sinv.store(s + i, Tag());
cosv.store(c + i, Tag()); cosv.store(c + i, Tag());
@@ -206,7 +202,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
const float dx = a[i] - idx * lookup_table_.PI_FRAC; const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx; const float dx2 = dx * dx;
const float dx3 = dx2 * dx; const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx; const float dx4 = dx2 * dx2;
const float cosdx = const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4; 1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3; const float sindx = dx - lookup_table_.TERM3 * dx3;

View File

@@ -17,7 +17,6 @@ void ReferenceBackend::compute_cosf(size_t n, const float *x, float *c) const {
void ReferenceBackend::compute_sincosf(size_t n, const float *x, float *s, void ReferenceBackend::compute_sincosf(size_t n, const float *x, float *s,
float *c) const { float *c) const {
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
s[i] = sinf(x[i]); sincosf(x[i], &s[i], &c[i]);
c[i] = cosf(x[i]);
} }
} }