44 Commits

Author SHA1 Message Date
Wiebe van Breukelen
5f00c5d304 Add README.md (#38)
* Add README.md

* Improve README description of TrigDx library

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

* Apply suggestion from @mickveldhuis

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>

---------

Co-authored-by: Mick Veldhuis <mickveldhuis@hotmail.nl>
2025-10-22 16:55:37 +02:00
Wiebe van Breukelen
f85e67e669 Fix compiler warnings (#37) 2025-10-22 16:48:26 +02:00
mmancini-skao
76998a137a Fix error in Taylor expansion (#36)
Replaced term3 with term4 in calculations for t4.
2025-10-20 17:09:35 +02:00
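For context on the fix above: the lookup backends refine each table entry with a short Taylor expansion around the sampled point x_0 (see the lookup_xsimd changes further down this page), and the fourth-order coefficient must be 1/4!, i.e. `t4` is built from `TERM4` rather than `TERM3`:

$$\cos(x_0+\Delta) \approx \cos x_0\,\bigl(1 - \tfrac{\Delta^2}{2!} + \tfrac{\Delta^4}{4!}\bigr) - \sin x_0\,\bigl(\Delta - \tfrac{\Delta^3}{3!}\bigr)$$
$$\sin(x_0+\Delta) \approx \sin x_0\,\bigl(1 - \tfrac{\Delta^2}{2!} + \tfrac{\Delta^4}{4!}\bigr) + \cos x_0\,\bigl(\Delta - \tfrac{\Delta^3}{3!}\bigr)$$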
Bram Veenboer
500d35070e Fix formatting (#35)
* Run pre-commit

* Skip line-length check in cmake-lint
2025-10-10 09:19:18 +02:00
Dantali0n
bfe752433f Fixes #30, Add CMake steps to install python bindings (#31) 2025-09-17 20:03:28 +02:00
Bram Veenboer
8fe8314905 Update GPU backend (#29)
* Update GPU memory management
* Add allocate_memory and free_memory
2025-09-03 09:16:28 +02:00
Wiebe van Breukelen
9d3af8c202 Fixed broken pybind11 target check (#28)
* Fixed broken pybind11 target check

* Update python/CMakeLists.txt

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>

---------

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>
2025-08-27 17:07:30 +02:00
Bram Veenboer
0f7fd06be8 Extend CI (#27)
* Add build with Intel compiler
* Switch to upload/download-artifacts that retain permissions
2025-08-21 15:15:25 +02:00
Bram Veenboer
112baf447b Update the test workflow (#26)
- Upgrade to v1.2 of slurm-action
- Add extra element to matrix to test this new action
- Add cleanup step
- Simplify handling of artefacts
- Execute tests directly (not with ctest)
2025-08-21 09:06:42 +02:00
Wiebe van Breukelen
a5ba99ff5f Only fetch pybind11 when target not available (#25) 2025-08-15 16:19:34 +02:00
Wiebe van Breukelen
77c55d6824 Change CMAKE_SOURCE_DIR to PROJECT_SOURCE_DIR (#24) 2025-08-15 15:30:37 +02:00
Bram Veenboer
c85df5f69c Add Taylor expansion to LookupXSIMD (#23)
Co-authored-by: Wiebe van Breukelen <breukelen@astron.nl>
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:53:15 +02:00
Bram Veenboer
9c17e90c77 Add AVX checks (#20)
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:30:57 +02:00
Bram Veenboer
e755c1a454 Fix CI (#21) 2025-08-15 10:29:48 +02:00
Bram Veenboer
b129d9ffaf Update CI (#19) 2025-08-15 10:12:55 +02:00
Bram Veenboer
428f60f6d6 Delete .vscode/settings.json (#18) 2025-08-14 17:16:02 +02:00
Bram Veenboer
0e8ea57025 Add initial CI (#17) 2025-08-14 17:14:29 +02:00
Bram Veenboer
a072ffd12f Merge pull request #10 from astron-rd/add-python-interface
Add Python interface
2025-08-14 15:21:13 +02:00
Bram Veenboer
b3467840f9 Use const size_t for n 2025-08-14 11:07:45 +02:00
Bram Veenboer
79dc7b4285 Add dimension checks 2025-08-14 11:05:31 +02:00
Bram Veenboer
97692cface Use Type template for helper functions 2025-08-14 11:02:30 +02:00
Bram Veenboer
f40c44d5dd Add Python interface 2025-08-14 10:57:06 +02:00
Wiebe van Breukelen
0cbefb77b7 Fix CMake interface include paths (#14)
* Fixed include path

* Formatting

* Remove lookup_constexpr.hpp
2025-08-14 09:34:27 +02:00
Bram Veenboer
cdc94ab2cb Merge pull request #11 from astron-rd/fix-cuda-test
Bugfixes in data copies in CUDA backend
2025-08-12 17:16:10 +02:00
Bram Veenboer
33f98abc48 Set tolerance for CUDA tests to 1e-6 2025-08-12 17:14:41 +02:00
Bram Veenboer
c7ab463b43 Bugfix gpu output data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
ebb6d50c0b Bugfix gpu input data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
5338f3e135 Cleanup data initialization of tests 2025-08-12 17:14:41 +02:00
Bram Veenboer
f6575599fd Cleanup data initialization of benchmarks 2025-08-12 17:14:41 +02:00
Bram Veenboer
cd048e5581 Remove unused DEFAULT_N 2025-08-12 17:14:41 +02:00
Bram Veenboer
eb3806442d Merge pull request #9 from astron-rd/cmake-bugfix
Fix issue when building with TRIGDX_USE_GPU=1
2025-08-12 15:28:47 +02:00
Bram Veenboer
e58d6dae8d Merge pull request #7 from astron-rd/use-google-benchmark
Refactor benchmarks using Google Benchmark
2025-08-12 15:24:58 +02:00
Bram Veenboer
a65137322d Use static_cast 2025-08-12 15:18:07 +02:00
Bram Veenboer
b936b3998e Fix issue when building with TRIGDX_USE_GPU=1 2025-08-12 14:59:36 +02:00
Bram Veenboer
0e2d9862d5 Merge pull request #8 from astron-rd/use-common-headers
Add trigdx_config.hpp and trigdx.hpp header files
2025-08-12 14:50:19 +02:00
Bram Veenboer
2f39c5c86e Add trigdx_config.hpp and trigdx.hpp header files 2025-08-12 14:04:13 +02:00
Bram Veenboer
5e7aca89bb Refactor benchmarks using Google Benchmark 2025-08-12 13:17:23 +02:00
Bram Veenboer
e9a74ef283 Merge pull request #6 from astron-rd/fix-tests-cmake
Improve the CMake for the tests
2025-08-12 11:38:25 +02:00
Bram Veenboer
832da6229d Improve the CMake 2025-08-12 11:28:43 +02:00
Wiebe van Breukelen
b71611ed17 Merge pull request #5 from astron-rd/auto-pull-xsimd
Pull xsimd when not available
2025-08-12 10:29:35 +02:00
Wiebe van Breukelen
5a4e80ea4a Pull xsimd when not available 2025-08-12 10:18:32 +02:00
Wiebe van Breukelen
13f18847bc Merge pull request #3 from astron-rd/add-gitignore
Add .gitignore
2025-08-12 10:11:04 +02:00
Wiebe van Breukelen
eee382307f Merge pull request #2 from astron-rd/add-cmake-option-prefix
Add TRIGDX prefix to CMake configurable options
2025-08-12 10:10:55 +02:00
Wiebe van Breukelen
d04fb8933d Add TRIGDX prefix to CMake configurable options 2025-08-12 09:52:28 +02:00
30 changed files with 862 additions and 305 deletions

.github/workflows/linting.yml

@@ -0,0 +1,15 @@
name: linting
on:
push:
jobs:
format:
name: Linting
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- run: sudo apt-get update
- run: sudo apt-get install -y clang-format-14 cmake-format pre-commit
- run: pre-commit run -a

.github/workflows/test.yml

@@ -0,0 +1,89 @@
name: test
on:
push:
jobs:
build:
runs-on: [slurm]
strategy:
matrix: &matrix
include:
- name: GNU, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-mkl cuda python"
- name: Intel, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-compilers intel-oneapi-mkl cuda python"
- name: NVIDIA GH200
partition: ghq
gres: gpu:GH200
cmake_flags: "-DTRIGDX_USE_GPU=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=90a -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
CMAKE_FLAGS: ${{ matrix.cmake_flags }}
ENVIRONMENT_MODULES: ${{ matrix.environment_modules }}
steps:
- &cleanup
name: "Remove old workspace"
run: |
rm -rf $GITHUB_WORKSPACE/*
mkdir -p $GITHUB_WORKSPACE
- uses: actions/checkout@v4
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
module load ${ENVIRONMENT_MODULES}
cmake -S . -B build ${CMAKE_FLAGS}
make -C build -j
- name: Upload build
uses: pyTooling/upload-artifact@v4
with:
name: build-${{ matrix.name }}
path: build
test:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/tests -type f -executable -exec {} \;
benchmark:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/benchmarks -type f -executable -exec {} \;


@@ -7,4 +7,5 @@ repos:
rev: v0.6.13
hooks:
- id: cmake-format
- id: cmake-lint
- id: cmake-lint
args: [--disabled-codes=C0301]

.vscode/settings.json

@@ -1,77 +0,0 @@
{
"files.associations": {
"*.cc": "cpp",
"*.tpp": "cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"csignal": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"regex": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"format": "cpp",
"fstream": "cpp",
"future": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"ranges": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"typeinfo": "cpp",
"variant": "cpp",
"queue": "cpp",
"stack": "cpp"
}
}


@@ -3,13 +3,45 @@ project(trigdx LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(USE_MKL "Enable Intel MKL backend" OFF)
option(USE_GPU "Enable GPU backend" OFF)
option(USE_XSIMD "Enable XSIMD backend" OFF)
option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
option(TRIGDX_BUILD_TESTS "Build tests" ON)
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
# Add compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
${CMAKE_CURRENT_BINARY_DIR}/include/trigdx/trigdx_config.hpp @ONLY)
if(TRIGDX_BUILD_TESTS
OR TRIGDX_BUILD_BENCHMARKS
OR TRIGDX_BUILD_PYTHON)
include(FetchContent)
endif()
include_directories(${PROJECT_SOURCE_DIR}/include)
add_subdirectory(src)
add_subdirectory(tests)
add_subdirectory(benchmarks)
if(TRIGDX_BUILD_TESTS)
include(CTest)
enable_testing()
add_subdirectory(tests)
endif()
if(TRIGDX_BUILD_BENCHMARKS)
add_subdirectory(benchmarks)
endif()
if(TRIGDX_BUILD_PYTHON)
add_subdirectory(python)
endif()

README.md

@@ -0,0 +1,54 @@
# TrigDx
High-performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signal-processing, and real-time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
## Why TrigDx?
Many applications use the standard library implementations of these functions, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
- Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable, as sketched below.
- Use SIMD/vectorized implementations and compact lookup tables for high-throughput evaluation.
- Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
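A minimal usage sketch (illustrative, not taken from the library's documentation; it assumes the `Backend` interface visible in the headers under `include/trigdx/`: `init`, `compute_sincosf`, and the `LookupBackend<N>` lookup-table backend):
```cpp
#include <cstddef>
#include <vector>

#include <trigdx/trigdx.hpp>  // umbrella header; pulls in ReferenceBackend, LookupBackend<N>, ...

int main() {
  const std::size_t n = 1024;
  std::vector<float> x(n), s(n), c(n);
  for (std::size_t i = 0; i < n; ++i) {
    x[i] = (i % 360) * 0.0174533f;  // degrees to radians, as in the benchmarks
  }

  LookupBackend<16384> backend;  // 16384-entry sin/cos lookup table
  backend.init(n);               // optional one-time setup
  backend.compute_sincosf(n, x.data(), s.data(), c.data());
  return 0;
}
```
Any of the other backends (`ReferenceBackend`, `LookupAVXBackend<N>`, `LookupXSIMDBackend<N>`, `MKLBackend`, `GPUBackend`) can be swapped in behind the same calls.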
## Requirements
- A C++ compiler with at least C++17 support (GCC, Clang)
- CMake 3.15+
- Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
- Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
## Building
```bash
git clone https://github.com/astron-rd/TrigDx.git
cd TrigDx
mkdir build && cd build
# CPU-only:
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
cmake --build . -j
# Enable CUDA (if available):
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
cmake --build . -j
# Run tests:
ctest --output-on-failure -j
```
Common CMake options:
- `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
- `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
- `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
- `TRIGDX_BUILD_PYTHON=ON/OFF` — build the Python interface.
## Contributing
- Fork → create a feature branch → open a PR.
- Include unit tests for correctness-sensitive changes and benchmark results for performance changes.
- Follow project style (clang-format) and run tests locally before submitting.
## Reporting issues
When opening an issue for incorrect results or performance regressions, please include:
- Platform and CPU/GPU model.
- Compiler and version with exact compile flags.
- Small reproducer (input data and the TrigDx implementation used).
## License
See the LICENSE file in the repository for licensing details.


@@ -1,23 +1,36 @@
FetchContent_Declare(
benchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.4)
set(BENCHMARK_ENABLE_TESTING
OFF
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(benchmark)
add_executable(benchmark_reference benchmark_reference.cpp)
target_link_libraries(benchmark_reference PRIVATE trigdx)
target_link_libraries(benchmark_reference PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup benchmark_lookup.cpp)
target_link_libraries(benchmark_lookup PRIVATE trigdx)
target_link_libraries(benchmark_lookup PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx)
if(HAVE_AVX)
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx
benchmark::benchmark)
endif()
if(USE_MKL)
if(TRIGDX_USE_MKL)
add_executable(benchmark_mkl benchmark_mkl.cpp)
target_link_libraries(benchmark_mkl PRIVATE trigdx)
target_link_libraries(benchmark_mkl PRIVATE trigdx benchmark::benchmark)
endif()
if(USE_GPU)
if(TRIGDX_USE_GPU)
add_executable(benchmark_gpu benchmark_gpu.cpp)
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu benchmark::benchmark)
endif()
if(USE_XSIMD)
if(TRIGDX_USE_XSIMD)
add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx
benchmark::benchmark)
endif()


@@ -1,9 +1,21 @@
#include <trigdx/gpu.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<GPUBackend>();
benchmark_cosf<GPUBackend>();
benchmark_sincosf<GPUBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,13 +1,30 @@
#include <trigdx/lookup.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupBackend<16384>>();
benchmark_cosf<LookupBackend<16384>>();
benchmark_sincosf<LookupBackend<16384>>();
benchmark_sinf<LookupBackend<32768>>();
benchmark_cosf<LookupBackend<32768>>();
benchmark_sincosf<LookupBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupBackend<16384>>();
register_benchmarks<LookupBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,13 +1,30 @@
#include <trigdx/lookup_avx.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupAVXBackend<16384>>();
benchmark_cosf<LookupAVXBackend<16384>>();
benchmark_sincosf<LookupAVXBackend<16384>>();
benchmark_sinf<LookupAVXBackend<32768>>();
benchmark_cosf<LookupAVXBackend<32768>>();
benchmark_sincosf<LookupAVXBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupAVXBackend<16384>>();
register_benchmarks<LookupAVXBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,13 +1,30 @@
#include <trigdx/lookup_xsimd.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<LookupXSIMDBackend<16384>>();
benchmark_cosf<LookupXSIMDBackend<16384>>();
benchmark_sincosf<LookupXSIMDBackend<16384>>();
benchmark_sinf<LookupXSIMDBackend<32768>>();
benchmark_cosf<LookupXSIMDBackend<32768>>();
benchmark_sincosf<LookupXSIMDBackend<32768>>();
template <typename Backend> void register_benchmarks() {
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupXSIMDBackend<16384>>();
register_benchmarks<LookupXSIMDBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
}


@@ -1,9 +1,21 @@
#include <trigdx/mkl.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<MKLBackend>();
benchmark_cosf<MKLBackend>();
benchmark_sincosf<MKLBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,9 +1,21 @@
#include <trigdx/reference.hpp>
#include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp"
int main() {
benchmark_sinf<ReferenceBackend>();
benchmark_cosf<ReferenceBackend>();
benchmark_sincosf<ReferenceBackend>();
}
BENCHMARK_TEMPLATE(benchmark_sinf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();


@@ -1,76 +1,122 @@
#pragma once
#include <chrono>
#include <iomanip>
#include <iostream>
#include <cmath>
#include <stdexcept>
#include <string>
#include <vector>
const size_t N = 1e7;
#include <benchmark/benchmark.h>
inline void report(const std::string &name, double sec, double throughput) {
std::ios state(nullptr);
state.copyfmt(std::cout);
std::cout << std::setw(7) << name << " -> ";
std::cout << "time: ";
std::cout << std::fixed << std::setprecision(3) << std::setfill('0');
std::cout << sec << " s, ";
std::cout << "throughput: " << throughput << " M elems/sec\n";
std::cout.copyfmt(state);
void init_x(float *x, size_t n) {
for (size_t i = 0; i < n; ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians
}
}
template <typename Backend> inline void benchmark_sinf() {
std::vector<float> x(N), s(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_sinf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_sinf(N, x.data(), s.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("sinf", sec, throughput);
for (auto _ : state) {
backend.compute_sinf(N, x, s);
benchmark::DoNotOptimize(s);
}
backend.free_memory(x);
backend.free_memory(s);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}
template <typename Backend> inline void benchmark_cosf() {
std::vector<float> x(N), c(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_cosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_cosf(N, x.data(), c.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("cosf", sec, throughput);
for (auto _ : state) {
backend.compute_cosf(N, x, c);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}
template <typename Backend> inline void benchmark_sincosf() {
std::vector<float> x(N), s(N), c(N);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
template <typename Backend>
static void benchmark_sincosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now();
backend.compute_sincosf(N, x.data(), s.data(), c.data());
backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !s || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count();
double throughput = N / sec / 1e6;
init_x(x, N);
report("sincosf", sec, throughput);
for (auto _ : state) {
backend.compute_sincosf(N, x, s, c);
benchmark::DoNotOptimize(s);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(s);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
}

cmake/FindAVX.cmake

@@ -0,0 +1,42 @@
include(CheckCXXSourceRuns)
set(SUPPORTED_COMPILERS Clang;GNU;Intel;IntelLLVM)
if(CMAKE_CXX_COMPILER_ID IN_LIST SUPPORTED_COMPILERS)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_REQUIRED_FLAGS "-xHost") # ICC
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
set(CMAKE_REQUIRED_FLAGS "-march=native") # ICX
else()
set(CMAKE_REQUIRED_FLAGS "-march=native") # GCC/Clang
endif()
else()
message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}.")
endif()
# AVX check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256 a = _mm256_setzero_ps(); // AVX
(void) a;
return 0;
}
"
HAVE_AVX)
if(HAVE_AVX)
# AVX2 check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256i a = _mm256_set1_epi32(-1);
__m256i b = _mm256_abs_epi32(a); // AVX2
(void) b;
return 0;
}
"
HAVE_AVX2)
endif()


@@ -0,0 +1,5 @@
#pragma once
#cmakedefine TRIGDX_USE_MKL
#cmakedefine TRIGDX_USE_GPU
#cmakedefine TRIGDX_USE_XSIMD


@@ -11,7 +11,8 @@ public:
GPUBackend();
~GPUBackend() override;
void init(size_t n = 0) override;
void *allocate_memory(size_t bytes) const override;
void free_memory(void *ptr) const override;
void compute_sinf(size_t n, const float *x, float *s) const override;
void compute_cosf(size_t n, const float *x, float *c) const override;
void compute_sincosf(size_t n, const float *x, float *s,


@@ -1,6 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <cstdlib>
// Base interface for all math backends
class Backend {
@@ -10,6 +12,12 @@ public:
// Optional initialization
virtual void init(size_t n = 0) {}
virtual void *allocate_memory(size_t bytes) const {
return static_cast<void *>(new uint8_t[bytes]);
};
virtual void free_memory(void *ptr) const { std::free(ptr); };
// Compute sine for n elements
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;

include/trigdx/trigdx.hpp

@@ -0,0 +1,19 @@
#pragma once
#include <trigdx/trigdx_config.hpp>
#include <trigdx/lookup.hpp>
#include <trigdx/lookup_avx.hpp>
#include <trigdx/reference.hpp>
#if defined(TRIGDX_USE_MKL)
#include <trigdx/mkl.hpp>
#endif
#if defined(TRIGDX_USE_GPU)
#include <trigdx/gpu.hpp>
#endif
#if defined(TRIGDX_USE_XSIMD)
#include <trigdx/lookup_xsimd.hpp>
#endif

python/CMakeLists.txt

@@ -0,0 +1,23 @@
find_package(pybind11 CONFIG QUIET)
if(NOT pybind11_FOUND)
FetchContent_Declare(
pybind11
GIT_REPOSITORY https://github.com/pybind/pybind11.git
GIT_TAG v3.0.0)
FetchContent_MakeAvailable(pybind11)
endif()
# Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
find_package(Python REQUIRED)
pybind11_add_module(pytrigdx bindings.cpp)
target_link_libraries(pytrigdx PRIVATE trigdx)
set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
set(PYTHON_SITE_PACKAGES
"${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
)
install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})

python/__init__.py

@@ -0,0 +1,16 @@
from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
try:
from .trigdx import MKL
except ImportError:
pass
try:
from .trigdx import GPU
except ImportError:
pass
try:
from .trigdx import LookupXSIMD16K, LookupXSIMD32K
except ImportError:
pass

python/bindings.cpp

@@ -0,0 +1,96 @@
#include <memory>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <trigdx/trigdx.hpp>
namespace py = pybind11;
template <typename T>
py::array_t<T>
compute_sin(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<float> s(n);
T *s_ptr = s.mutable_data();
backend.compute_sinf(n, x_ptr, s_ptr);
return s;
}
template <typename T>
py::array_t<T>
compute_cos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> c(n);
T *c_ptr = c.mutable_data();
backend.compute_cosf(n, x_ptr, c_ptr);
return c;
}
template <typename T>
std::tuple<py::array_t<T>, py::array_t<T>>
compute_sincos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> s(n);
py::array_t<T> c(n);
backend.compute_sincosf(n, x_ptr, s.mutable_data(), c.mutable_data());
return std::make_tuple(s, c);
}
template <typename BackendType>
void bind_backend(py::module &m, const char *name) {
py::class_<BackendType, Backend, std::shared_ptr<BackendType>>(m, name)
.def(py::init<>())
.def("compute_sinf", &compute_sin<float>)
.def("compute_cosf", &compute_cos<float>)
.def("compute_sincosf", &compute_sincos<float>);
}
PYBIND11_MODULE(trigdx, m) {
m.doc() = "TrigDx python bindings";
py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
.def("init", &Backend::init);
bind_backend<ReferenceBackend>(m, "Reference");
bind_backend<LookupBackend<16384>>(m, "Lookup16K");
bind_backend<LookupBackend<32768>>(m, "Lookup32K");
bind_backend<LookupAVXBackend<16384>>(m, "LookupAVX16K");
bind_backend<LookupAVXBackend<32768>>(m, "LookupAVX32K");
#if defined(TRIGDX_USE_MKL)
bind_backend<MKLBackend>(m, "MKL");
#endif
#if defined(TRIGDX_USE_GPU)
bind_backend<GPUBackend>(m, "GPU");
#endif
#if defined(TRIGDX_USE_XSIMD)
bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
#endif
}


@@ -1,16 +1,20 @@
add_library(trigdx reference.cpp lookup.cpp lookup_avx.cpp)
include(FetchContent)
include(FindAVX)
add_library(trigdx reference.cpp lookup.cpp)
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
target_compile_options(trigdx PRIVATE -O3 -march=native)
if(HAVE_AVX)
target_sources(trigdx PRIVATE lookup_avx.cpp)
endif()
if(USE_MKL)
if(TRIGDX_USE_MKL)
find_package(MKL REQUIRED)
target_sources(trigdx PRIVATE mkl.cpp)
target_link_libraries(trigdx PRIVATE MKL::MKL)
endif()
if(USE_GPU)
if(TRIGDX_USE_GPU)
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
add_library(gpu SHARED gpu/gpu.cu)
@@ -19,8 +23,20 @@ if(USE_GPU)
target_link_libraries(trigdx PRIVATE gpu)
endif()
if(USE_XSIMD)
find_package(xsimd REQUIRED)
if(TRIGDX_USE_XSIMD)
# Requires XSIMD > 13 for architecture independent dispatching
find_package(xsimd 13 QUIET)
if(NOT TARGET xsimd)
FetchContent_Declare(
xsimd
GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
GIT_TAG 13.2.0)
FetchContent_MakeAvailable(xsimd)
endif()
target_sources(trigdx PRIVATE lookup_xsimd.cpp)
target_link_libraries(trigdx PRIVATE xsimd)
endif()
target_include_directories(
trigdx INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>)


@@ -10,75 +10,63 @@
struct GPUBackend::Impl {
~Impl() {
if (h_x) {
cudaFreeHost(h_x);
}
if (h_s) {
cudaFreeHost(h_s);
}
if (h_c) {
cudaFreeHost(h_c);
}
if (d_x) {
cudaFree(d_x);
}
if (d_s) {
cudaFree(d_s);
}
if (d_c) {
cudaFree(d_c);
}
void *allocate_memory(size_t bytes) const {
void *ptr;
cudaMallocHost(&ptr, bytes);
return ptr;
}
void init(size_t n) {
const size_t bytes = n * sizeof(float);
cudaMallocHost(&h_x, bytes);
cudaMallocHost(&h_s, bytes);
cudaMallocHost(&h_c, bytes);
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
}
void free_memory(void *ptr) const { cudaFreeHost(ptr); }
void compute_sinf(size_t n, const float *x, float *s) const {
const size_t bytes = n * sizeof(float);
std::memcpy(h_x, x, bytes);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
std::memcpy(s, h_s, bytes);
float *d_x, *d_s;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sinf_kernel(d_x, d_s, n);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
}
void compute_cosf(size_t n, const float *x, float *c) const {
const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
std::memcpy(c, h_c, bytes);
float *d_x, *d_c;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_cosf_kernel(d_x, d_c, n);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_c);
}
void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
float *d_x, *d_s, *d_c;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
cudaFree(d_c);
}
float *h_x = nullptr;
float *h_s = nullptr;
float *h_c = nullptr;
float *d_x = nullptr;
float *d_s = nullptr;
float *d_c = nullptr;
};
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
GPUBackend::~GPUBackend() = default;
void GPUBackend::init(size_t n) { impl->init(n); }
void *GPUBackend::allocate_memory(size_t bytes) const {
return impl->allocate_memory(bytes);
}
void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
impl->compute_sinf(n, x, s);


@@ -160,7 +160,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
for (std::size_t i = 0; i < n; ++i) {
std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
s[i] = lookup[idx];
c[i] = lookup[idx_cos];
}
#endif


@@ -8,12 +8,20 @@
template <std::size_t NR_SAMPLES> struct lookup_table {
static constexpr std::size_t MASK = NR_SAMPLES - 1;
static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
lookup_table() : values{} {
static constexpr float PI_FRAC = 2.0f * M_PIf32 / NR_SAMPLES;
static constexpr float TERM1 = 1.0f; // 1
static constexpr float TERM2 = 0.5f; // 1/2!
static constexpr float TERM3 = 1.0f / 6.0f; // 1/3!
static constexpr float TERM4 = 1.0f / 24.0f; // 1/4!
constexpr lookup_table() : sin_values{}, cos_values{} {
for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES));
sin_values[i] = sinf(i * PI_FRAC);
cos_values[i] = cosf(i * PI_FRAC);
}
}
std::array<float, NR_SAMPLES> values;
std::array<float, NR_SAMPLES> sin_values;
std::array<float, NR_SAMPLES> cos_values;
};
template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -25,28 +33,56 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi);
idx_cos = xsimd::bitwise_and(idx_cos, mask);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
const b_type f_idx = xsimd::to_float(idx);
idx = xsimd::bitwise_and(idx, mask);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
cosv.store(c + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
c[i] = lookup_table_.values[idx_cos];
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
c[i] = cosv * cosdx - sinv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;
@@ -61,25 +97,53 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
idx = xsimd::bitwise_and(idx, mask);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
b_type f_idx = xsimd::to_float(idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
idx = xsimd::bitwise_and(idx, mask);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
sinv.store(s + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
s[i] = lookup_table_.values[idx];
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;
@@ -94,31 +158,56 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const m_type mask = m_type::broadcast(lookup_table_.MASK);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i;
for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi);
b_type f_idx = xsimd::to_float(idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
idx = xsimd::bitwise_and(idx, mask);
idx_cos = xsimd::bitwise_and(idx_cos, mask);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
sinv.store(s + i, Tag());
cosv.store(c + i, Tag());
}
for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
s[i] = lookup_table_.values[idx];
c[i] = lookup_table_.values[idx_cos];
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
c[i] = cosv * cosdx - sinv * sindx;
}
}
lookup_table<NR_SAMPLES> lookup_table_;


@@ -1,5 +1,3 @@
include(FetchContent)
FetchContent_Declare(
catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
@@ -9,31 +7,31 @@ FetchContent_MakeAvailable(catch2)
# Lookup backend test
add_executable(test_lookup test_lookup.cpp)
target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
# LookupAVX backend test
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
# MKL backend test
if(USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
endif()
include(CTest)
add_test(NAME test_lookup COMMAND test_lookup)
if(USE_MKL)
# LookupAVX backend test
if(HAVE_AVX)
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_avx COMMAND test_lookup_avx)
endif()
# MKL backend test
if(TRIGDX_USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_mkl COMMAND test_mkl)
endif()
if(USE_GPU)
# GPU backend test
if(TRIGDX_USE_GPU)
add_executable(test_gpu test_gpu.cpp)
target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_gpu COMMAND test_gpu)
endif()
if(USE_XSIMD)
# XSIMD backend test
if(TRIGDX_USE_XSIMD)
add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd)


@@ -3,8 +3,8 @@
#include "test_utils.hpp"
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); }
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-6f); }
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); }
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-6f); }
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); }
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-6f); }


@@ -4,16 +4,16 @@
#include "test_utils.hpp"
TEST_CASE("sincosf") {
test_sincosf<LookupXSIMDBackend<16384>>(1e-2f);
test_sincosf<LookupXSIMDBackend<32768>>(1e-2f);
test_sincosf<LookupXSIMDBackend<16384>>(1e-6f);
test_sincosf<LookupXSIMDBackend<32768>>(1e-6f);
}
TEST_CASE("sinf") {
test_sinf<LookupXSIMDBackend<16384>>(1e-2f);
test_sinf<LookupXSIMDBackend<32768>>(1e-2f);
test_sinf<LookupXSIMDBackend<16384>>(1e-6f);
test_sinf<LookupXSIMDBackend<32768>>(1e-6f);
}
TEST_CASE("cosf") {
test_cosf<LookupXSIMDBackend<16384>>(1e-2f);
test_cosf<LookupXSIMDBackend<32768>>(1e-2f);
test_cosf<LookupXSIMDBackend<16384>>(1e-6f);
test_cosf<LookupXSIMDBackend<32768>>(1e-6f);
}


@@ -7,14 +7,17 @@
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <trigdx/reference.hpp>
const size_t N = 1e7;
const size_t N = 1234;
void init_x(std::vector<float> &x) {
for (size_t i = 0; i < x.size(); ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians
}
}
template <typename Backend> inline void test_sinf(float tol) {
std::vector<float> x(N), s_ref(N), s(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;
@@ -30,10 +33,7 @@ template <typename Backend> inline void test_sinf(float tol) {
template <typename Backend> inline void test_cosf(float tol) {
std::vector<float> x(N), c_ref(N), c(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;
@@ -49,10 +49,7 @@ template <typename Backend> inline void test_cosf(float tol) {
template <typename Backend> inline void test_sincosf(float tol) {
std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
init_x(x);
ReferenceBackend ref;
Backend backend;