43 Commits

Author SHA1 Message Date
Wiebe van Breukelen
38664f6acb Fix compiler warnings 2025-10-22 15:20:50 +02:00
mmancini-skao
76998a137a Fix error in Taylor expansion (#36)
Replaced term3 with term4 in calculations for t4.
2025-10-20 17:09:35 +02:00
Bram Veenboer
500d35070e Fix formatting (#35)
* Run pre-commit

* Skip line-length check in cmake-lint
2025-10-10 09:19:18 +02:00
Dantali0n
bfe752433f Fixes #30, Add CMake steps to install python bindings (#31) 2025-09-17 20:03:28 +02:00
Bram Veenboer
8fe8314905 Update GPU backend (#29)
* Update GPU memory management
* Add allocate_memory and free_memory
2025-09-03 09:16:28 +02:00
Wiebe van Breukelen
9d3af8c202 Fixed broken pybind11 target check (#28)
* Fixed broken pybind11 target check

* Update python/CMakeLists.txt

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>

---------

Co-authored-by: Bram Veenboer <bram.veenboer@gmail.com>
2025-08-27 17:07:30 +02:00
Bram Veenboer
0f7fd06be8 Extend CI (#27)
* Add build with Intel compiler
* Switch to upload/download-artifacts that retain permissions
2025-08-21 15:15:25 +02:00
Bram Veenboer
112baf447b Update the test workflow (#26)
- Upgrade to v1.2 of slurm-action
- Add extra element to matrix to test this new action
- Add cleanup step
- Simplify handling of artefacts
- Execute tests directly (not with ctest)
2025-08-21 09:06:42 +02:00
Wiebe van Breukelen
a5ba99ff5f Only fetch pybind11 when target not available (#25) 2025-08-15 16:19:34 +02:00
Wiebe van Breukelen
77c55d6824 Change CMAKE_SOURCE_DIR to PROJECT_SOURCE_DIR (#24) 2025-08-15 15:30:37 +02:00
Bram Veenboer
c85df5f69c Add Taylor expansion to LookupXSIMD (#23)
Co-authored-by: Wiebe van Breukelen <breukelen@astron.nl>
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:53:15 +02:00
Bram Veenboer
9c17e90c77 Add AVX checks (#20)
Co-authored-by: mancini <mancini@astron.nl>
2025-08-15 10:30:57 +02:00
Bram Veenboer
e755c1a454 Fix CI (#21) 2025-08-15 10:29:48 +02:00
Bram Veenboer
b129d9ffaf Update CI (#19) 2025-08-15 10:12:55 +02:00
Bram Veenboer
428f60f6d6 Delete .vscode/settings.json (#18) 2025-08-14 17:16:02 +02:00
Bram Veenboer
0e8ea57025 Add initial CI (#17) 2025-08-14 17:14:29 +02:00
Bram Veenboer
a072ffd12f Merge pull request #10 from astron-rd/add-python-interface
Add Python interface
2025-08-14 15:21:13 +02:00
Bram Veenboer
b3467840f9 Use const size_t for n 2025-08-14 11:07:45 +02:00
Bram Veenboer
79dc7b4285 Add dimension checks 2025-08-14 11:05:31 +02:00
Bram Veenboer
97692cface Use Type template for helper functions 2025-08-14 11:02:30 +02:00
Bram Veenboer
f40c44d5dd Add Python interface 2025-08-14 10:57:06 +02:00
Wiebe van Breukelen
0cbefb77b7 Fix CMake interface include paths (#14)
* Fixed include path

* Formatting

* Remove lookup_constexpr.hpp
2025-08-14 09:34:27 +02:00
Bram Veenboer
cdc94ab2cb Merge pull request #11 from astron-rd/fix-cuda-test
Bugfixes in data copies in CUDA backend
2025-08-12 17:16:10 +02:00
Bram Veenboer
33f98abc48 Set tolerance for CUDA tests to 1e-6 2025-08-12 17:14:41 +02:00
Bram Veenboer
c7ab463b43 Bugfix gpu output data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
ebb6d50c0b Bugfix gpu input data copy 2025-08-12 17:14:41 +02:00
Bram Veenboer
5338f3e135 Cleanup data initialization of tests 2025-08-12 17:14:41 +02:00
Bram Veenboer
f6575599fd Cleanup data initialization of benchmarks 2025-08-12 17:14:41 +02:00
Bram Veenboer
cd048e5581 Remove unused DEFAULT_N 2025-08-12 17:14:41 +02:00
Bram Veenboer
eb3806442d Merge pull request #9 from astron-rd/cmake-bugfix
Fix issue when building with TRIGDX_USE_GPU=1
2025-08-12 15:28:47 +02:00
Bram Veenboer
e58d6dae8d Merge pull request #7 from astron-rd/use-google-benchmark
Refactor benchmarks using Google Benchmark
2025-08-12 15:24:58 +02:00
Bram Veenboer
a65137322d Use static_cast 2025-08-12 15:18:07 +02:00
Bram Veenboer
b936b3998e Fix issue when building with TRIGDX_USE_GPU=1 2025-08-12 14:59:36 +02:00
Bram Veenboer
0e2d9862d5 Merge pull request #8 from astron-rd/use-common-headers
Add trigdx_config.hpp and trigdx.hpp header files
2025-08-12 14:50:19 +02:00
Bram Veenboer
2f39c5c86e Add trigdx_config.hpp and trigdx.hpp header files 2025-08-12 14:04:13 +02:00
Bram Veenboer
5e7aca89bb Refactor benchmarks using Google Benchmark 2025-08-12 13:17:23 +02:00
Bram Veenboer
e9a74ef283 Merge pull request #6 from astron-rd/fix-tests-cmake
Improve the CMake for the tests
2025-08-12 11:38:25 +02:00
Bram Veenboer
832da6229d Improve the CMake 2025-08-12 11:28:43 +02:00
Wiebe van Breukelen
b71611ed17 Merge pull request #5 from astron-rd/auto-pull-xsimd
Pull xsimd when not available
2025-08-12 10:29:35 +02:00
Wiebe van Breukelen
5a4e80ea4a Pull xsimd when not available 2025-08-12 10:18:32 +02:00
Wiebe van Breukelen
13f18847bc Merge pull request #3 from astron-rd/add-gitignore
Add .gitignore
2025-08-12 10:11:04 +02:00
Wiebe van Breukelen
eee382307f Merge pull request #2 from astron-rd/add-cmake-option-prefix
Add TRIGDX prefix to CMake configurable options
2025-08-12 10:10:55 +02:00
Wiebe van Breukelen
d04fb8933d Add TRIGDX prefix to CMake configurable options 2025-08-12 09:52:28 +02:00
29 changed files with 808 additions and 305 deletions

15
.github/workflows/linting.yml vendored Normal file
View File

@@ -0,0 +1,15 @@
name: linting
on:
push:
jobs:
format:
name: Linting
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- run: sudo apt-get update
- run: sudo apt-get install -y clang-format-14 cmake-format pre-commit
- run: pre-commit run -a

89
.github/workflows/test.yml vendored Normal file
View File

@@ -0,0 +1,89 @@
name: test
on:
push:
jobs:
build:
runs-on: [slurm]
strategy:
matrix: &matrix
include:
- name: GNU, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-mkl cuda python"
- name: Intel, NVIDIA A4000
partition: defq
gres: gpu:A4000
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
environment_modules: "spack/20250403 intel-oneapi-compilers intel-oneapi-mkl cuda python"
- name: NVIDIA GH200
partition: ghq
gres: gpu:GH200
cmake_flags: "-DTRIGDX_USE_GPU=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=90a -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
CMAKE_FLAGS: ${{ matrix.cmake_flags }}
ENVIRONMENT_MODULES: ${{ matrix.environment_modules }}
steps:
- &cleanup
name: "Remove old workspace"
run: |
rm -rf $GITHUB_WORKSPACE/*
mkdir -p $GITHUB_WORKSPACE
- uses: actions/checkout@v4
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
module load ${ENVIRONMENT_MODULES}
cmake -S . -B build ${CMAKE_FLAGS}
make -C build -j
- name: Upload build
uses: pyTooling/upload-artifact@v4
with:
name: build-${{ matrix.name }}
path: build
test:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/tests -type f -executable -exec {} \;
benchmark:
runs-on: [slurm]
needs: build
strategy:
matrix: *matrix
env:
DEVICE_NAME: ${{ matrix.name }}
PARTITION_NAME: ${{ matrix.partition }}
steps:
- *cleanup
- uses: pyTooling/download-artifact@v4
with:
name: build-${{ matrix.name }}
- uses: astron-rd/slurm-action@v1.2
with:
partition: ${{ matrix.partition }}
gres: ${{ matrix.gres }}
commands: |
find build/benchmarks -type f -executable -exec {} \;

View File

@@ -8,3 +8,4 @@ repos:
hooks: hooks:
- id: cmake-format - id: cmake-format
- id: cmake-lint - id: cmake-lint
args: [--disabled-codes=C0301]

77
.vscode/settings.json vendored
View File

@@ -1,77 +0,0 @@
{
"files.associations": {
"*.cc": "cpp",
"*.tpp": "cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"csignal": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"regex": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"format": "cpp",
"fstream": "cpp",
"future": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"ranges": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"typeinfo": "cpp",
"variant": "cpp",
"queue": "cpp",
"stack": "cpp"
}
}

View File

@@ -3,13 +3,45 @@ project(trigdx LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(USE_MKL "Enable Intel MKL backend" OFF) option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
option(USE_GPU "Enable GPU backend" OFF) option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
option(USE_XSIMD "Enable XSIMD backend" OFF) option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
option(TRIGDX_BUILD_TESTS "Build tests" ON)
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
# Add compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
${CMAKE_CURRENT_BINARY_DIR}/include/trigdx/trigdx_config.hpp @ONLY)
if(TRIGDX_BUILD_TESTS
OR TRIGDX_BUILD_BENCHMARKS
OR TRIGDX_BUILD_PYTHON)
include(FetchContent)
endif()
include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/include)
add_subdirectory(src) add_subdirectory(src)
add_subdirectory(tests)
add_subdirectory(benchmarks) if(TRIGDX_BUILD_TESTS)
include(CTest)
enable_testing()
add_subdirectory(tests)
endif()
if(TRIGDX_BUILD_BENCHMARKS)
add_subdirectory(benchmarks)
endif()
if(TRIGDX_BUILD_PYTHON)
add_subdirectory(python)
endif()

View File

@@ -1,23 +1,36 @@
FetchContent_Declare(
benchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.4)
set(BENCHMARK_ENABLE_TESTING
OFF
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(benchmark)
add_executable(benchmark_reference benchmark_reference.cpp) add_executable(benchmark_reference benchmark_reference.cpp)
target_link_libraries(benchmark_reference PRIVATE trigdx) target_link_libraries(benchmark_reference PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup benchmark_lookup.cpp) add_executable(benchmark_lookup benchmark_lookup.cpp)
target_link_libraries(benchmark_lookup PRIVATE trigdx) target_link_libraries(benchmark_lookup PRIVATE trigdx benchmark::benchmark)
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp) if(HAVE_AVX)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx) add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx
benchmark::benchmark)
endif()
if(USE_MKL) if(TRIGDX_USE_MKL)
add_executable(benchmark_mkl benchmark_mkl.cpp) add_executable(benchmark_mkl benchmark_mkl.cpp)
target_link_libraries(benchmark_mkl PRIVATE trigdx) target_link_libraries(benchmark_mkl PRIVATE trigdx benchmark::benchmark)
endif() endif()
if(USE_GPU) if(TRIGDX_USE_GPU)
add_executable(benchmark_gpu benchmark_gpu.cpp) add_executable(benchmark_gpu benchmark_gpu.cpp)
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu) target_link_libraries(benchmark_gpu PRIVATE trigdx gpu benchmark::benchmark)
endif() endif()
if(USE_XSIMD) if(TRIGDX_USE_XSIMD)
add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp) add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx) target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx
benchmark::benchmark)
endif() endif()

View File

@@ -1,9 +1,21 @@
#include <trigdx/gpu.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { BENCHMARK_TEMPLATE(benchmark_sinf, GPUBackend)
benchmark_sinf<GPUBackend>(); ->Unit(benchmark::kMillisecond)
benchmark_cosf<GPUBackend>(); ->Arg(1e5)
benchmark_sincosf<GPUBackend>(); ->Arg(1e6)
} ->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, GPUBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();

View File

@@ -1,13 +1,30 @@
#include <trigdx/lookup.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { template <typename Backend> void register_benchmarks() {
benchmark_sinf<LookupBackend<16384>>(); BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
benchmark_cosf<LookupBackend<16384>>(); ->Unit(benchmark::kMillisecond)
benchmark_sincosf<LookupBackend<16384>>(); ->Arg(1e5)
->Arg(1e6)
benchmark_sinf<LookupBackend<32768>>(); ->Arg(1e7);
benchmark_cosf<LookupBackend<32768>>(); BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
benchmark_sincosf<LookupBackend<32768>>(); ->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupBackend<16384>>();
register_benchmarks<LookupBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
} }

View File

@@ -1,13 +1,30 @@
#include <trigdx/lookup_avx.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { template <typename Backend> void register_benchmarks() {
benchmark_sinf<LookupAVXBackend<16384>>(); BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
benchmark_cosf<LookupAVXBackend<16384>>(); ->Unit(benchmark::kMillisecond)
benchmark_sincosf<LookupAVXBackend<16384>>(); ->Arg(1e5)
->Arg(1e6)
benchmark_sinf<LookupAVXBackend<32768>>(); ->Arg(1e7);
benchmark_cosf<LookupAVXBackend<32768>>(); BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
benchmark_sincosf<LookupAVXBackend<32768>>(); ->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupAVXBackend<16384>>();
register_benchmarks<LookupAVXBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
} }

View File

@@ -1,13 +1,30 @@
#include <trigdx/lookup_xsimd.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { template <typename Backend> void register_benchmarks() {
benchmark_sinf<LookupXSIMDBackend<16384>>(); BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
benchmark_cosf<LookupXSIMDBackend<16384>>(); ->Unit(benchmark::kMillisecond)
benchmark_sincosf<LookupXSIMDBackend<16384>>(); ->Arg(1e5)
->Arg(1e6)
benchmark_sinf<LookupXSIMDBackend<32768>>(); ->Arg(1e7);
benchmark_cosf<LookupXSIMDBackend<32768>>(); BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
benchmark_sincosf<LookupXSIMDBackend<32768>>(); ->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
}
int main(int argc, char **argv) {
::benchmark::Initialize(&argc, argv);
register_benchmarks<LookupXSIMDBackend<16384>>();
register_benchmarks<LookupXSIMDBackend<32768>>();
return ::benchmark::RunSpecifiedBenchmarks();
} }

View File

@@ -1,9 +1,21 @@
#include <trigdx/mkl.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { BENCHMARK_TEMPLATE(benchmark_sinf, MKLBackend)
benchmark_sinf<MKLBackend>(); ->Unit(benchmark::kMillisecond)
benchmark_cosf<MKLBackend>(); ->Arg(1e5)
benchmark_sincosf<MKLBackend>(); ->Arg(1e6)
} ->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, MKLBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();

View File

@@ -1,9 +1,21 @@
#include <trigdx/reference.hpp> #include <trigdx/trigdx.hpp>
#include "benchmark_utils.hpp" #include "benchmark_utils.hpp"
int main() { BENCHMARK_TEMPLATE(benchmark_sinf, ReferenceBackend)
benchmark_sinf<ReferenceBackend>(); ->Unit(benchmark::kMillisecond)
benchmark_cosf<ReferenceBackend>(); ->Arg(1e5)
benchmark_sincosf<ReferenceBackend>(); ->Arg(1e6)
} ->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_cosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_TEMPLATE(benchmark_sincosf, ReferenceBackend)
->Unit(benchmark::kMillisecond)
->Arg(1e5)
->Arg(1e6)
->Arg(1e7);
BENCHMARK_MAIN();

View File

@@ -1,76 +1,122 @@
#pragma once #pragma once
#include <chrono> #include <chrono>
#include <iomanip> #include <cmath>
#include <iostream> #include <stdexcept>
#include <string>
#include <vector> #include <vector>
const size_t N = 1e7; #include <benchmark/benchmark.h>
inline void report(const std::string &name, double sec, double throughput) { void init_x(float *x, size_t n) {
std::ios state(nullptr); for (size_t i = 0; i < n; ++i) {
state.copyfmt(std::cout); x[i] = (i % 360) * 0.0174533f; // degrees to radians
std::cout << std::setw(7) << name << " -> "; }
std::cout << "time: ";
std::cout << std::fixed << std::setprecision(3) << std::setfill('0');
std::cout << sec << " s, ";
std::cout << "throughput: " << throughput << " M elems/sec\n";
std::cout.copyfmt(state);
} }
template <typename Backend> inline void benchmark_sinf() { template <typename Backend>
std::vector<float> x(N), s(N); static void benchmark_sinf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
Backend backend; Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.compute_sinf(N, x.data(), s.data()); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count(); init_x(x, N);
double throughput = N / sec / 1e6;
report("sinf", sec, throughput); for (auto _ : state) {
backend.compute_sinf(N, x, s);
benchmark::DoNotOptimize(s);
}
backend.free_memory(x);
backend.free_memory(s);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
} }
template <typename Backend> inline void benchmark_cosf() { template <typename Backend>
std::vector<float> x(N), c(N); static void benchmark_cosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
Backend backend; Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.compute_cosf(N, x.data(), c.data()); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count(); init_x(x, N);
double throughput = N / sec / 1e6;
report("cosf", sec, throughput); for (auto _ : state) {
backend.compute_cosf(N, x, c);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
} }
template <typename Backend> inline void benchmark_sincosf() { template <typename Backend>
std::vector<float> x(N), s(N), c(N); static void benchmark_sincosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0));
for (size_t i = 0; i < N; ++i)
x[i] = (i % 360) * 0.0174533f; // degrees to radians
Backend backend; Backend backend;
backend.init(N);
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.compute_sincosf(N, x.data(), s.data(), c.data()); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !s || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() /
1.e3;
double sec = std::chrono::duration<double>(end - start).count(); init_x(x, N);
double throughput = N / sec / 1e6;
report("sincosf", sec, throughput); for (auto _ : state) {
backend.compute_sincosf(N, x, s, c);
benchmark::DoNotOptimize(s);
benchmark::DoNotOptimize(c);
}
backend.free_memory(x);
backend.free_memory(s);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N));
} }

42
cmake/FindAVX.cmake Normal file
View File

@@ -0,0 +1,42 @@
include(CheckCXXSourceRuns)
set(SUPPORTED_COMPILERS Clang;GNU;Intel;IntelLLVM)
if(CMAKE_CXX_COMPILER_ID IN_LIST SUPPORTED_COMPILERS)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_REQUIRED_FLAGS "-xHost") # ICC
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
set(CMAKE_REQUIRED_FLAGS "-march=native") # ICX
else()
set(CMAKE_REQUIRED_FLAGS "-march=native") # GCC/Clang
endif()
else()
message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}.")
endif()
# AVX check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256 a = _mm256_setzero_ps(); // AVX
(void) a;
return 0;
}
"
HAVE_AVX)
if(HAVE_AVX)
# AVX2 check
check_cxx_source_runs(
"
#include <immintrin.h>
int main() {
__m256i a = _mm256_set1_epi32(-1);
__m256i b = _mm256_abs_epi32(a); // AVX2
(void) b;
return 0;
}
"
HAVE_AVX2)
endif()

View File

@@ -0,0 +1,5 @@
#pragma once
#cmakedefine TRIGDX_USE_MKL
#cmakedefine TRIGDX_USE_GPU
#cmakedefine TRIGDX_USE_XSIMD

View File

@@ -11,7 +11,8 @@ public:
GPUBackend(); GPUBackend();
~GPUBackend() override; ~GPUBackend() override;
void init(size_t n = 0) override; void *allocate_memory(size_t bytes) const override;
void free_memory(void *ptr) const override;
void compute_sinf(size_t n, const float *x, float *s) const override; void compute_sinf(size_t n, const float *x, float *s) const override;
void compute_cosf(size_t n, const float *x, float *c) const override; void compute_cosf(size_t n, const float *x, float *c) const override;
void compute_sincosf(size_t n, const float *x, float *s, void compute_sincosf(size_t n, const float *x, float *s,

View File

@@ -1,6 +1,8 @@
#pragma once #pragma once
#include <cstddef> #include <cstddef>
#include <cstdint>
#include <cstdlib>
// Base interface for all math backends // Base interface for all math backends
class Backend { class Backend {
@@ -10,6 +12,12 @@ public:
// Optional initialization // Optional initialization
virtual void init(size_t n = 0) {} virtual void init(size_t n = 0) {}
virtual void *allocate_memory(size_t bytes) const {
return static_cast<void *>(new uint8_t[bytes]);
};
virtual void free_memory(void *ptr) const { std::free(ptr); };
// Compute sine for n elements // Compute sine for n elements
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;

19
include/trigdx/trigdx.hpp Normal file
View File

@@ -0,0 +1,19 @@
#pragma once
#include <trigdx/trigdx_config.hpp>
#include <trigdx/lookup.hpp>
#include <trigdx/lookup_avx.hpp>
#include <trigdx/reference.hpp>
#if defined(TRIGDX_USE_MKL)
#include <trigdx/mkl.hpp>
#endif
#if defined(TRIGDX_USE_GPU)
#include <trigdx/gpu.hpp>
#endif
#if defined(TRIGDX_USE_XSIMD)
#include <trigdx/lookup_xsimd.hpp>
#endif

23
python/CMakeLists.txt Normal file
View File

@@ -0,0 +1,23 @@
find_package(pybind11 CONFIG QUIET)
if(NOT pybind11_FOUND)
FetchContent_Declare(
pybind11
GIT_REPOSITORY https://github.com/pybind/pybind11.git
GIT_TAG v3.0.0)
FetchContent_MakeAvailable(pybind11)
endif()
# Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
find_package(Python REQUIRED)
pybind11_add_module(pytrigdx bindings.cpp)
target_link_libraries(pytrigdx PRIVATE trigdx)
set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
set(PYTHON_SITE_PACKAGES
"${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
)
install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})

16
python/__init__.py Normal file
View File

@@ -0,0 +1,16 @@
from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
try:
from .trigdx import MKL
except ImportError:
pass
try:
from .trigdx import GPU
except ImportError:
pass
try:
from .trigdx import LookupXSIMD16K, LookupXSIMD32K
except ImportError:
pass

96
python/bindings.cpp Normal file
View File

@@ -0,0 +1,96 @@
#include <memory>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <trigdx/trigdx.hpp>
namespace py = pybind11;
template <typename T>
py::array_t<T>
compute_sin(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<float> s(n);
T *s_ptr = s.mutable_data();
backend.compute_sinf(n, x_ptr, s_ptr);
return s;
}
template <typename T>
py::array_t<T>
compute_cos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> c(n);
T *c_ptr = c.mutable_data();
backend.compute_cosf(n, x_ptr, c_ptr);
return c;
}
template <typename T>
std::tuple<py::array_t<T>, py::array_t<T>>
compute_sincos(const Backend &backend,
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
const size_t n = x.shape(0);
if (x.ndim() != 1) {
throw py::value_error("Input array must be 1-dimensional");
}
const T *x_ptr = x.data();
py::array_t<T> s(n);
py::array_t<T> c(n);
backend.compute_sincosf(n, x_ptr, s.mutable_data(), c.mutable_data());
return std::make_tuple(s, c);
}
template <typename BackendType>
void bind_backend(py::module &m, const char *name) {
py::class_<BackendType, Backend, std::shared_ptr<BackendType>>(m, name)
.def(py::init<>())
.def("compute_sinf", &compute_sin<float>)
.def("compute_cosf", &compute_cos<float>)
.def("compute_sincosf", &compute_sincos<float>);
}
PYBIND11_MODULE(trigdx, m) {
m.doc() = "TrigDx python bindings";
py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
.def("init", &Backend::init);
bind_backend<ReferenceBackend>(m, "Reference");
bind_backend<LookupBackend<16384>>(m, "Lookup16K");
bind_backend<LookupBackend<32768>>(m, "Lookup32K");
bind_backend<LookupAVXBackend<16384>>(m, "LookupAVX16K");
bind_backend<LookupAVXBackend<32768>>(m, "LookupAVX32K");
#if defined(TRIGDX_USE_MKL)
bind_backend<MKLBackend>(m, "MKL");
#endif
#if defined(TRIGDX_USE_GPU)
bind_backend<GPUBackend>(m, "GPU");
#endif
#if defined(TRIGDX_USE_XSIMD)
bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
#endif
}

View File

@@ -1,16 +1,20 @@
add_library(trigdx reference.cpp lookup.cpp lookup_avx.cpp) include(FetchContent)
include(FindAVX)
add_library(trigdx reference.cpp lookup.cpp)
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include) target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
target_compile_options(trigdx PRIVATE -O3 -march=native) if(HAVE_AVX)
target_sources(trigdx PRIVATE lookup_avx.cpp)
endif()
if(USE_MKL) if(TRIGDX_USE_MKL)
find_package(MKL REQUIRED) find_package(MKL REQUIRED)
target_sources(trigdx PRIVATE mkl.cpp) target_sources(trigdx PRIVATE mkl.cpp)
target_link_libraries(trigdx PRIVATE MKL::MKL) target_link_libraries(trigdx PRIVATE MKL::MKL)
endif() endif()
if(USE_GPU) if(TRIGDX_USE_GPU)
enable_language(CUDA) enable_language(CUDA)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
add_library(gpu SHARED gpu/gpu.cu) add_library(gpu SHARED gpu/gpu.cu)
@@ -19,8 +23,20 @@ if(USE_GPU)
target_link_libraries(trigdx PRIVATE gpu) target_link_libraries(trigdx PRIVATE gpu)
endif() endif()
if(USE_XSIMD) if(TRIGDX_USE_XSIMD)
find_package(xsimd REQUIRED) # Requires XSIMD > 13 for architecture independent dispatching
find_package(xsimd 13 QUIET)
if(NOT TARGET xsimd)
FetchContent_Declare(
xsimd
GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
GIT_TAG 13.2.0)
FetchContent_MakeAvailable(xsimd)
endif()
target_sources(trigdx PRIVATE lookup_xsimd.cpp) target_sources(trigdx PRIVATE lookup_xsimd.cpp)
target_link_libraries(trigdx PRIVATE xsimd) target_link_libraries(trigdx PRIVATE xsimd)
endif() endif()
target_include_directories(
trigdx INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>)

View File

@@ -10,75 +10,63 @@
struct GPUBackend::Impl { struct GPUBackend::Impl {
~Impl() { void *allocate_memory(size_t bytes) const {
if (h_x) { void *ptr;
cudaFreeHost(h_x); cudaMallocHost(&ptr, bytes);
} return ptr;
if (h_s) {
cudaFreeHost(h_s);
}
if (h_c) {
cudaFreeHost(h_c);
}
if (d_x) {
cudaFree(d_x);
}
if (d_s) {
cudaFree(d_s);
}
if (d_c) {
cudaFree(d_c);
}
} }
void init(size_t n) { void free_memory(void *ptr) const { cudaFreeHost(ptr); }
const size_t bytes = n * sizeof(float);
cudaMallocHost(&h_x, bytes);
cudaMallocHost(&h_s, bytes);
cudaMallocHost(&h_c, bytes);
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
}
void compute_sinf(size_t n, const float *x, float *s) const { void compute_sinf(size_t n, const float *x, float *s) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
std::memcpy(h_x, x, bytes); float *d_x, *d_s;
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); cudaMalloc(&d_x, bytes);
launch_sincosf_kernel(d_x, d_s, d_c, n); cudaMalloc(&d_s, bytes);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
std::memcpy(s, h_s, bytes); launch_sinf_kernel(d_x, d_s, n);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
} }
void compute_cosf(size_t n, const float *x, float *c) const { void compute_cosf(size_t n, const float *x, float *c) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); float *d_x, *d_c;
launch_sincosf_kernel(d_x, d_s, d_c, n); cudaMalloc(&d_x, bytes);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost); cudaMalloc(&d_c, bytes);
std::memcpy(c, h_c, bytes); cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_cosf_kernel(d_x, d_c, n);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_c);
} }
void compute_sincosf(size_t n, const float *x, float *s, float *c) const { void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); float *d_x, *d_s, *d_c;
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n); launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_s);
cudaFree(d_c);
} }
float *h_x = nullptr;
float *h_s = nullptr;
float *h_c = nullptr;
float *d_x = nullptr;
float *d_s = nullptr;
float *d_c = nullptr;
}; };
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {} GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
GPUBackend::~GPUBackend() = default; GPUBackend::~GPUBackend() = default;
void GPUBackend::init(size_t n) { impl->init(n); } void *GPUBackend::allocate_memory(size_t bytes) const {
return impl->allocate_memory(bytes);
}
void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const { void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
impl->compute_sinf(n, x, s); impl->compute_sinf(n, x, s);

View File

@@ -160,7 +160,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
for (std::size_t i = 0; i < n; ++i) { for (std::size_t i = 0; i < n; ++i) {
std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK; std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK; std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
s[i] = lookup[idx];
c[i] = lookup[idx_cos]; c[i] = lookup[idx_cos];
} }
#endif #endif

View File

@@ -8,12 +8,20 @@
template <std::size_t NR_SAMPLES> struct lookup_table { template <std::size_t NR_SAMPLES> struct lookup_table {
static constexpr std::size_t MASK = NR_SAMPLES - 1; static constexpr std::size_t MASK = NR_SAMPLES - 1;
static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI)); static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
lookup_table() : values{} { static constexpr float PI_FRAC = 2.0f * M_PIf32 / NR_SAMPLES;
static constexpr float TERM1 = 1.0f; // 1
static constexpr float TERM2 = 0.5f; // 1/2!
static constexpr float TERM3 = 1.0f / 6.0f; // 1/3!
static constexpr float TERM4 = 1.0f / 24.0f; // 1/4!
constexpr lookup_table() : sin_values{}, cos_values{} {
for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) { for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES)); sin_values[i] = sinf(i * PI_FRAC);
cos_values[i] = cosf(i * PI_FRAC);
} }
} }
std::array<float, NR_SAMPLES> values; std::array<float, NR_SAMPLES> sin_values;
std::array<float, NR_SAMPLES> cos_values;
}; };
template <std::size_t NR_SAMPLES> struct cosf_dispatcher { template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -25,28 +33,56 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI); const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale); const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled); m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi); const b_type f_idx = xsimd::to_float(idx);
idx_cos = xsimd::bitwise_and(idx_cos, mask); idx = xsimd::bitwise_and(idx, mask);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
cosv.store(c + i, Tag()); cosv.store(c + i, Tag());
} }
for (; i < n; i++) { for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) & std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
c[i] = lookup_table_.values[idx_cos]; std::size_t masked = idx & lookup_table_.MASK;
const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
c[i] = cosv * cosdx - sinv * sindx;
} }
} }
lookup_table<NR_SAMPLES> lookup_table_; lookup_table<NR_SAMPLES> lookup_table_;
@@ -61,25 +97,53 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
const m_type quarter_pi = m_type::broadcast(Q_PI); const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale); const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled); m_type idx = xsimd::to_int(scaled);
idx = xsimd::bitwise_and(idx, mask); b_type f_idx = xsimd::to_float(idx);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx); const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
idx = xsimd::bitwise_and(idx, mask);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
sinv.store(s + i, Tag()); sinv.store(s + i, Tag());
} }
for (; i < n; i++) { for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) & std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
lookup_table_.MASK; std::size_t masked = idx & lookup_table_.MASK;
s[i] = lookup_table_.values[idx]; const float cosv = lookup_table_.cos_values[masked];
const float sinv = lookup_table_.sin_values[masked];
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
} }
} }
lookup_table<NR_SAMPLES> lookup_table_; lookup_table<NR_SAMPLES> lookup_table_;
@@ -94,31 +158,56 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
const b_type scaled = xsimd::mul(vx, scale); const b_type scaled = xsimd::mul(vx, scale);
m_type idx = xsimd::to_int(scaled); m_type idx = xsimd::to_int(scaled);
m_type idx_cos = xsimd::add(idx, quarter_pi); b_type f_idx = xsimd::to_float(idx);
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
const b_type dx2 = xsimd::mul(dx, dx);
const b_type dx3 = xsimd::mul(dx2, dx);
const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4);
idx = xsimd::bitwise_and(idx, mask); idx = xsimd::bitwise_and(idx, mask);
idx_cos = xsimd::bitwise_and(idx_cos, mask); b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx); b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3);
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
sinv.store(s + i, Tag()); sinv.store(s + i, Tag());
cosv.store(c + i, Tag()); cosv.store(c + i, Tag());
} }
for (; i < n; i++) { for (; i < n; i++) {
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) & std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
lookup_table_.MASK; std::size_t masked = idx & lookup_table_.MASK;
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK; const float cosv = lookup_table_.cos_values[masked];
s[i] = lookup_table_.values[idx]; const float sinv = lookup_table_.sin_values[masked];
c[i] = lookup_table_.values[idx_cos]; const float dx = a[i] - idx * lookup_table_.PI_FRAC;
const float dx2 = dx * dx;
const float dx3 = dx2 * dx;
const float dx4 = dx3 * dx;
const float cosdx =
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
const float sindx = dx - lookup_table_.TERM3 * dx3;
s[i] = sinv * cosdx + cosv * sindx;
c[i] = cosv * cosdx - sinv * sindx;
} }
} }
lookup_table<NR_SAMPLES> lookup_table_; lookup_table<NR_SAMPLES> lookup_table_;

View File

@@ -1,5 +1,3 @@
include(FetchContent)
FetchContent_Declare( FetchContent_Declare(
catch2 catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git GIT_REPOSITORY https://github.com/catchorg/Catch2.git
@@ -9,31 +7,31 @@ FetchContent_MakeAvailable(catch2)
# Lookup backend test # Lookup backend test
add_executable(test_lookup test_lookup.cpp) add_executable(test_lookup test_lookup.cpp)
target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain) target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
# LookupAVX backend test
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
# MKL backend test
if(USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
endif()
include(CTest)
add_test(NAME test_lookup COMMAND test_lookup) add_test(NAME test_lookup COMMAND test_lookup)
if(USE_MKL) # LookupAVX backend test
if(HAVE_AVX)
add_executable(test_lookup_avx test_lookup_avx.cpp)
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_avx COMMAND test_lookup_avx)
endif()
# MKL backend test
if(TRIGDX_USE_MKL)
add_executable(test_mkl test_mkl.cpp)
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_mkl COMMAND test_mkl) add_test(NAME test_mkl COMMAND test_mkl)
endif() endif()
if(USE_GPU) # GPU backend test
if(TRIGDX_USE_GPU)
add_executable(test_gpu test_gpu.cpp) add_executable(test_gpu test_gpu.cpp)
target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain) target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_gpu COMMAND test_gpu) add_test(NAME test_gpu COMMAND test_gpu)
endif() endif()
if(USE_XSIMD) # XSIMD backend test
if(TRIGDX_USE_XSIMD)
add_executable(test_lookup_xsimd test_lookup_xsimd.cpp) add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain) target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd) add_test(NAME test_lookup_xsimd COMMAND test_lookup_xsimd)

View File

@@ -3,8 +3,8 @@
#include "test_utils.hpp" #include "test_utils.hpp"
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); } TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-6f); }
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); } TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-6f); }
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); } TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-6f); }

View File

@@ -4,16 +4,16 @@
#include "test_utils.hpp" #include "test_utils.hpp"
TEST_CASE("sincosf") { TEST_CASE("sincosf") {
test_sincosf<LookupXSIMDBackend<16384>>(1e-2f); test_sincosf<LookupXSIMDBackend<16384>>(1e-6f);
test_sincosf<LookupXSIMDBackend<32768>>(1e-2f); test_sincosf<LookupXSIMDBackend<32768>>(1e-6f);
} }
TEST_CASE("sinf") { TEST_CASE("sinf") {
test_sinf<LookupXSIMDBackend<16384>>(1e-2f); test_sinf<LookupXSIMDBackend<16384>>(1e-6f);
test_sinf<LookupXSIMDBackend<32768>>(1e-2f); test_sinf<LookupXSIMDBackend<32768>>(1e-6f);
} }
TEST_CASE("cosf") { TEST_CASE("cosf") {
test_cosf<LookupXSIMDBackend<16384>>(1e-2f); test_cosf<LookupXSIMDBackend<16384>>(1e-6f);
test_cosf<LookupXSIMDBackend<32768>>(1e-2f); test_cosf<LookupXSIMDBackend<32768>>(1e-6f);
} }

View File

@@ -7,14 +7,17 @@
#include <catch2/matchers/catch_matchers_floating_point.hpp> #include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <trigdx/reference.hpp> #include <trigdx/reference.hpp>
const size_t N = 1e7; const size_t N = 1234;
void init_x(std::vector<float> &x) {
for (size_t i = 0; i < x.size(); ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians
}
}
template <typename Backend> inline void test_sinf(float tol) { template <typename Backend> inline void test_sinf(float tol) {
std::vector<float> x(N), s_ref(N), s(N); std::vector<float> x(N), s_ref(N), s(N);
init_x(x);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
ReferenceBackend ref; ReferenceBackend ref;
Backend backend; Backend backend;
@@ -30,10 +33,7 @@ template <typename Backend> inline void test_sinf(float tol) {
template <typename Backend> inline void test_cosf(float tol) { template <typename Backend> inline void test_cosf(float tol) {
std::vector<float> x(N), c_ref(N), c(N); std::vector<float> x(N), c_ref(N), c(N);
init_x(x);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
ReferenceBackend ref; ReferenceBackend ref;
Backend backend; Backend backend;
@@ -49,10 +49,7 @@ template <typename Backend> inline void test_cosf(float tol) {
template <typename Backend> inline void test_sincosf(float tol) { template <typename Backend> inline void test_sincosf(float tol) {
std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N); std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
init_x(x);
for (size_t i = 0; i < N; ++i) {
x[i] = float(i) * 0.01f;
}
ReferenceBackend ref; ReferenceBackend ref;
Backend backend; Backend backend;