Compare commits
45 Commits
add-cmake-
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58bc640d6d | ||
|
|
5f00c5d304 | ||
|
|
f85e67e669 | ||
|
|
76998a137a | ||
|
|
500d35070e | ||
|
|
bfe752433f | ||
|
|
8fe8314905 | ||
|
|
9d3af8c202 | ||
|
|
0f7fd06be8 | ||
|
|
112baf447b | ||
|
|
a5ba99ff5f | ||
|
|
77c55d6824 | ||
|
|
c85df5f69c | ||
|
|
9c17e90c77 | ||
|
|
e755c1a454 | ||
|
|
b129d9ffaf | ||
|
|
428f60f6d6 | ||
|
|
0e8ea57025 | ||
|
|
a072ffd12f | ||
|
|
b3467840f9 | ||
|
|
79dc7b4285 | ||
|
|
97692cface | ||
|
|
f40c44d5dd | ||
|
|
0cbefb77b7 | ||
|
|
cdc94ab2cb | ||
|
|
33f98abc48 | ||
|
|
c7ab463b43 | ||
|
|
ebb6d50c0b | ||
|
|
5338f3e135 | ||
|
|
f6575599fd | ||
|
|
cd048e5581 | ||
|
|
eb3806442d | ||
|
|
e58d6dae8d | ||
|
|
a65137322d | ||
|
|
b936b3998e | ||
|
|
0e2d9862d5 | ||
|
|
2f39c5c86e | ||
|
|
5e7aca89bb | ||
|
|
e9a74ef283 | ||
|
|
832da6229d | ||
|
|
b71611ed17 | ||
|
|
5a4e80ea4a | ||
|
|
13f18847bc | ||
|
|
eee382307f | ||
|
|
8eef1e966b |
15
.github/workflows/linting.yml
vendored
Normal file
15
.github/workflows/linting.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
name: linting
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
jobs:
|
||||||
|
format:
|
||||||
|
name: Linting
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
- run: sudo apt-get update
|
||||||
|
- run: sudo apt-get install -y clang-format-14 cmake-format pre-commit
|
||||||
|
- run: pre-commit run -a
|
||||||
89
.github/workflows/test.yml
vendored
Normal file
89
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
name: test
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: [slurm]
|
||||||
|
strategy:
|
||||||
|
matrix: &matrix
|
||||||
|
include:
|
||||||
|
- name: GNU, NVIDIA A4000
|
||||||
|
partition: defq
|
||||||
|
gres: gpu:A4000
|
||||||
|
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
|
||||||
|
environment_modules: "spack/20250403 intel-oneapi-mkl cuda python"
|
||||||
|
- name: Intel, NVIDIA A4000
|
||||||
|
partition: defq
|
||||||
|
gres: gpu:A4000
|
||||||
|
cmake_flags: "-DTRIGDX_USE_MKL=1 -DTRIGDX_USE_GPU=1 -DTRIGDX_USE_MKL=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=86"
|
||||||
|
environment_modules: "spack/20250403 intel-oneapi-compilers intel-oneapi-mkl cuda python"
|
||||||
|
- name: NVIDIA GH200
|
||||||
|
partition: ghq
|
||||||
|
gres: gpu:GH200
|
||||||
|
cmake_flags: "-DTRIGDX_USE_GPU=1 -DTRIGDX_USE_XSIMD=1 -DCMAKE_CUDA_ARCHITECTURES=90a -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
|
||||||
|
env:
|
||||||
|
DEVICE_NAME: ${{ matrix.name }}
|
||||||
|
PARTITION_NAME: ${{ matrix.partition }}
|
||||||
|
CMAKE_FLAGS: ${{ matrix.cmake_flags }}
|
||||||
|
ENVIRONMENT_MODULES: ${{ matrix.environment_modules }}
|
||||||
|
steps:
|
||||||
|
- &cleanup
|
||||||
|
name: "Remove old workspace"
|
||||||
|
run: |
|
||||||
|
rm -rf $GITHUB_WORKSPACE/*
|
||||||
|
mkdir -p $GITHUB_WORKSPACE
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: astron-rd/slurm-action@v1.2
|
||||||
|
with:
|
||||||
|
partition: ${{ matrix.partition }}
|
||||||
|
gres: ${{ matrix.gres }}
|
||||||
|
commands: |
|
||||||
|
module load ${ENVIRONMENT_MODULES}
|
||||||
|
cmake -S . -B build ${CMAKE_FLAGS}
|
||||||
|
make -C build -j
|
||||||
|
- name: Upload build
|
||||||
|
uses: pyTooling/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: build-${{ matrix.name }}
|
||||||
|
path: build
|
||||||
|
|
||||||
|
test:
|
||||||
|
runs-on: [slurm]
|
||||||
|
needs: build
|
||||||
|
strategy:
|
||||||
|
matrix: *matrix
|
||||||
|
env:
|
||||||
|
DEVICE_NAME: ${{ matrix.name }}
|
||||||
|
PARTITION_NAME: ${{ matrix.partition }}
|
||||||
|
steps:
|
||||||
|
- *cleanup
|
||||||
|
- uses: pyTooling/download-artifact@v4
|
||||||
|
with:
|
||||||
|
name: build-${{ matrix.name }}
|
||||||
|
- uses: astron-rd/slurm-action@v1.2
|
||||||
|
with:
|
||||||
|
partition: ${{ matrix.partition }}
|
||||||
|
gres: ${{ matrix.gres }}
|
||||||
|
commands: |
|
||||||
|
find build/tests -type f -executable -exec {} \;
|
||||||
|
|
||||||
|
benchmark:
|
||||||
|
runs-on: [slurm]
|
||||||
|
needs: build
|
||||||
|
strategy:
|
||||||
|
matrix: *matrix
|
||||||
|
env:
|
||||||
|
DEVICE_NAME: ${{ matrix.name }}
|
||||||
|
PARTITION_NAME: ${{ matrix.partition }}
|
||||||
|
steps:
|
||||||
|
- *cleanup
|
||||||
|
- uses: pyTooling/download-artifact@v4
|
||||||
|
with:
|
||||||
|
name: build-${{ matrix.name }}
|
||||||
|
- uses: astron-rd/slurm-action@v1.2
|
||||||
|
with:
|
||||||
|
partition: ${{ matrix.partition }}
|
||||||
|
gres: ${{ matrix.gres }}
|
||||||
|
commands: |
|
||||||
|
find build/benchmarks -type f -executable -exec {} \;
|
||||||
72
.gitignore
vendored
Normal file
72
.gitignore
vendored
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# Generated files
|
||||||
|
**/.DS_Store
|
||||||
|
*.slo
|
||||||
|
*.lo
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
|
||||||
|
# Precompiled Headers
|
||||||
|
*.gch
|
||||||
|
*.pch
|
||||||
|
|
||||||
|
# Compiled Dynamic libraries
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.dll
|
||||||
|
|
||||||
|
# Fortran module files
|
||||||
|
*.mod
|
||||||
|
*.smod
|
||||||
|
|
||||||
|
# Compiled Static libraries
|
||||||
|
*.lai
|
||||||
|
*.la
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
|
||||||
|
# Executables
|
||||||
|
*.exe
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
|
||||||
|
# CMake-related
|
||||||
|
**/cmake-build-debug
|
||||||
|
**/CMakeCache.txt
|
||||||
|
**/cmake_install.cmake
|
||||||
|
**/install_manifest.txt
|
||||||
|
**/CMakeFiles/
|
||||||
|
**/CTestTestfile.cmake
|
||||||
|
**/Makefile
|
||||||
|
**/*.cbp
|
||||||
|
**/CMakeScripts
|
||||||
|
**/compile_commands.json
|
||||||
|
**/CMakeLists.txt.user
|
||||||
|
**/_deps
|
||||||
|
**/Testing
|
||||||
|
**/CMakeUserPresets.json
|
||||||
|
|
||||||
|
# Local
|
||||||
|
build/**/*
|
||||||
|
lib/*
|
||||||
|
bin/*
|
||||||
|
test/test_runner
|
||||||
|
*.user*
|
||||||
|
|
||||||
|
# Visual Studio work directory
|
||||||
|
.vs/
|
||||||
|
# Visual Studio build directory
|
||||||
|
out/
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
# CLion work directory
|
||||||
|
.idea/
|
||||||
|
# CLion build directories
|
||||||
|
cmake-build-*/
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
env
|
||||||
|
venv
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
@@ -7,4 +7,5 @@ repos:
|
|||||||
rev: v0.6.13
|
rev: v0.6.13
|
||||||
hooks:
|
hooks:
|
||||||
- id: cmake-format
|
- id: cmake-format
|
||||||
- id: cmake-lint
|
- id: cmake-lint
|
||||||
|
args: [--disabled-codes=C0301]
|
||||||
|
|||||||
77
.vscode/settings.json
vendored
77
.vscode/settings.json
vendored
@@ -1,77 +0,0 @@
|
|||||||
{
|
|
||||||
"files.associations": {
|
|
||||||
"*.cc": "cpp",
|
|
||||||
"*.tpp": "cpp",
|
|
||||||
"array": "cpp",
|
|
||||||
"atomic": "cpp",
|
|
||||||
"bit": "cpp",
|
|
||||||
"bitset": "cpp",
|
|
||||||
"cctype": "cpp",
|
|
||||||
"charconv": "cpp",
|
|
||||||
"chrono": "cpp",
|
|
||||||
"clocale": "cpp",
|
|
||||||
"cmath": "cpp",
|
|
||||||
"compare": "cpp",
|
|
||||||
"concepts": "cpp",
|
|
||||||
"condition_variable": "cpp",
|
|
||||||
"csignal": "cpp",
|
|
||||||
"cstdarg": "cpp",
|
|
||||||
"cstddef": "cpp",
|
|
||||||
"cstdint": "cpp",
|
|
||||||
"cstdio": "cpp",
|
|
||||||
"cstdlib": "cpp",
|
|
||||||
"cstring": "cpp",
|
|
||||||
"ctime": "cpp",
|
|
||||||
"cwchar": "cpp",
|
|
||||||
"cwctype": "cpp",
|
|
||||||
"deque": "cpp",
|
|
||||||
"list": "cpp",
|
|
||||||
"map": "cpp",
|
|
||||||
"set": "cpp",
|
|
||||||
"string": "cpp",
|
|
||||||
"unordered_map": "cpp",
|
|
||||||
"vector": "cpp",
|
|
||||||
"exception": "cpp",
|
|
||||||
"algorithm": "cpp",
|
|
||||||
"functional": "cpp",
|
|
||||||
"iterator": "cpp",
|
|
||||||
"memory": "cpp",
|
|
||||||
"memory_resource": "cpp",
|
|
||||||
"numeric": "cpp",
|
|
||||||
"optional": "cpp",
|
|
||||||
"random": "cpp",
|
|
||||||
"ratio": "cpp",
|
|
||||||
"regex": "cpp",
|
|
||||||
"string_view": "cpp",
|
|
||||||
"system_error": "cpp",
|
|
||||||
"tuple": "cpp",
|
|
||||||
"type_traits": "cpp",
|
|
||||||
"utility": "cpp",
|
|
||||||
"format": "cpp",
|
|
||||||
"fstream": "cpp",
|
|
||||||
"future": "cpp",
|
|
||||||
"initializer_list": "cpp",
|
|
||||||
"iomanip": "cpp",
|
|
||||||
"iosfwd": "cpp",
|
|
||||||
"iostream": "cpp",
|
|
||||||
"istream": "cpp",
|
|
||||||
"limits": "cpp",
|
|
||||||
"mutex": "cpp",
|
|
||||||
"new": "cpp",
|
|
||||||
"numbers": "cpp",
|
|
||||||
"ostream": "cpp",
|
|
||||||
"ranges": "cpp",
|
|
||||||
"semaphore": "cpp",
|
|
||||||
"span": "cpp",
|
|
||||||
"sstream": "cpp",
|
|
||||||
"stdexcept": "cpp",
|
|
||||||
"stop_token": "cpp",
|
|
||||||
"streambuf": "cpp",
|
|
||||||
"text_encoding": "cpp",
|
|
||||||
"thread": "cpp",
|
|
||||||
"typeinfo": "cpp",
|
|
||||||
"variant": "cpp",
|
|
||||||
"queue": "cpp",
|
|
||||||
"stack": "cpp"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -3,13 +3,45 @@ project(trigdx LANGUAGES CXX)
|
|||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
|
option(TRIGDX_USE_MKL "Enable Intel MKL backend" OFF)
|
||||||
option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
|
option(TRIGDX_USE_GPU "Enable GPU backend" OFF)
|
||||||
option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
|
option(TRIGDX_USE_XSIMD "Enable XSIMD backend" OFF)
|
||||||
|
option(TRIGDX_BUILD_TESTS "Build tests" ON)
|
||||||
|
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
|
||||||
|
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
|
||||||
|
|
||||||
|
# Add compiler flags
|
||||||
|
set(CMAKE_CXX_FLAGS
|
||||||
|
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
|
||||||
|
)
|
||||||
|
|
||||||
|
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
||||||
|
configure_file(
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in
|
||||||
|
${CMAKE_CURRENT_BINARY_DIR}/include/trigdx/trigdx_config.hpp @ONLY)
|
||||||
|
|
||||||
|
if(TRIGDX_BUILD_TESTS
|
||||||
|
OR TRIGDX_BUILD_BENCHMARKS
|
||||||
|
OR TRIGDX_BUILD_PYTHON)
|
||||||
|
include(FetchContent)
|
||||||
|
endif()
|
||||||
|
|
||||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
add_subdirectory(tests)
|
|
||||||
add_subdirectory(benchmarks)
|
if(TRIGDX_BUILD_TESTS)
|
||||||
|
include(CTest)
|
||||||
|
enable_testing()
|
||||||
|
add_subdirectory(tests)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(TRIGDX_BUILD_BENCHMARKS)
|
||||||
|
add_subdirectory(benchmarks)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(TRIGDX_BUILD_PYTHON)
|
||||||
|
add_subdirectory(python)
|
||||||
|
endif()
|
||||||
|
|||||||
54
README.md
Normal file
54
README.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# TrigDx
|
||||||
|
|
||||||
|
High‑performance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signal‑processing, and real‑time systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
|
||||||
|
|
||||||
|
## Why TrigDx?
|
||||||
|
Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
|
||||||
|
|
||||||
|
- Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
|
||||||
|
- Use SIMD/vectorized implementations and compact lookup tables for high throughput lookups.
|
||||||
|
- Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
- A C++ compiler with at least C++17 support (GCC, Clang)
|
||||||
|
- CMake 3.15+
|
||||||
|
- Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
|
||||||
|
- Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
|
||||||
|
|
||||||
|
## Building
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/astron-rd/TrigDx.git
|
||||||
|
cd TrigDx
|
||||||
|
mkdir build && cd build
|
||||||
|
|
||||||
|
# CPU-only:
|
||||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
|
||||||
|
cmake --build . -j
|
||||||
|
|
||||||
|
# Enable CUDA (if available):
|
||||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
|
||||||
|
cmake --build . -j
|
||||||
|
|
||||||
|
# Run tests:
|
||||||
|
ctest --output-on-failure -j
|
||||||
|
```
|
||||||
|
|
||||||
|
Common CMake options:
|
||||||
|
- `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
|
||||||
|
- `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
|
||||||
|
- `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
|
||||||
|
- `TRIGDX_BUILD_PYTHON` — build Python interface.
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
- Fork → create a feature branch → open a PR.
|
||||||
|
- Include unit tests for correctness‑sensitive changes and benchmark results for performance changes.
|
||||||
|
- Follow project style (clang‑format) and run tests locally before submitting.
|
||||||
|
|
||||||
|
## Reporting issues
|
||||||
|
When opening an issue for incorrect results or performance regressions, please include:
|
||||||
|
- Platform and CPU/GPU model.
|
||||||
|
- Compiler and version with exact compile flags.
|
||||||
|
- Small reproducer (input data and the TrigDx implementation used).
|
||||||
|
|
||||||
|
## License
|
||||||
|
See the LICENSE file in the repository for licensing details.
|
||||||
@@ -1,23 +1,36 @@
|
|||||||
|
FetchContent_Declare(
|
||||||
|
benchmark
|
||||||
|
GIT_REPOSITORY https://github.com/google/benchmark.git
|
||||||
|
GIT_TAG v1.9.4)
|
||||||
|
set(BENCHMARK_ENABLE_TESTING
|
||||||
|
OFF
|
||||||
|
CACHE BOOL "" FORCE)
|
||||||
|
FetchContent_MakeAvailable(benchmark)
|
||||||
|
|
||||||
add_executable(benchmark_reference benchmark_reference.cpp)
|
add_executable(benchmark_reference benchmark_reference.cpp)
|
||||||
target_link_libraries(benchmark_reference PRIVATE trigdx)
|
target_link_libraries(benchmark_reference PRIVATE trigdx benchmark::benchmark)
|
||||||
|
|
||||||
add_executable(benchmark_lookup benchmark_lookup.cpp)
|
add_executable(benchmark_lookup benchmark_lookup.cpp)
|
||||||
target_link_libraries(benchmark_lookup PRIVATE trigdx)
|
target_link_libraries(benchmark_lookup PRIVATE trigdx benchmark::benchmark)
|
||||||
|
|
||||||
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
|
if(HAVE_AVX)
|
||||||
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx)
|
add_executable(benchmark_lookup_avx benchmark_lookup_avx.cpp)
|
||||||
|
target_link_libraries(benchmark_lookup_avx PRIVATE trigdx
|
||||||
|
benchmark::benchmark)
|
||||||
|
endif()
|
||||||
|
|
||||||
if(TRIGDX_USE_MKL)
|
if(TRIGDX_USE_MKL)
|
||||||
add_executable(benchmark_mkl benchmark_mkl.cpp)
|
add_executable(benchmark_mkl benchmark_mkl.cpp)
|
||||||
target_link_libraries(benchmark_mkl PRIVATE trigdx)
|
target_link_libraries(benchmark_mkl PRIVATE trigdx benchmark::benchmark)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(TRIGDX_USE_GPU)
|
if(TRIGDX_USE_GPU)
|
||||||
add_executable(benchmark_gpu benchmark_gpu.cpp)
|
add_executable(benchmark_gpu benchmark_gpu.cpp)
|
||||||
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
|
target_link_libraries(benchmark_gpu PRIVATE trigdx gpu benchmark::benchmark)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(TRIGDX_USE_XSIMD)
|
if(TRIGDX_USE_XSIMD)
|
||||||
add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
|
add_executable(benchmark_lookup_xsimd benchmark_lookup_xsimd.cpp)
|
||||||
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx)
|
target_link_libraries(benchmark_lookup_xsimd PRIVATE trigdx
|
||||||
|
benchmark::benchmark)
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -1,9 +1,21 @@
|
|||||||
#include <trigdx/gpu.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
BENCHMARK_TEMPLATE(benchmark_sinf, GPUBackend)
|
||||||
benchmark_sinf<GPUBackend>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_cosf<GPUBackend>();
|
->Arg(1e5)
|
||||||
benchmark_sincosf<GPUBackend>();
|
->Arg(1e6)
|
||||||
}
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_cosf, GPUBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, GPUBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
|
||||||
|
BENCHMARK_MAIN();
|
||||||
@@ -1,13 +1,30 @@
|
|||||||
#include <trigdx/lookup.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
template <typename Backend> void register_benchmarks() {
|
||||||
benchmark_sinf<LookupBackend<16384>>();
|
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
|
||||||
benchmark_cosf<LookupBackend<16384>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_sincosf<LookupBackend<16384>>();
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
benchmark_sinf<LookupBackend<32768>>();
|
->Arg(1e7);
|
||||||
benchmark_cosf<LookupBackend<32768>>();
|
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
|
||||||
benchmark_sincosf<LookupBackend<32768>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
::benchmark::Initialize(&argc, argv);
|
||||||
|
|
||||||
|
register_benchmarks<LookupBackend<16384>>();
|
||||||
|
register_benchmarks<LookupBackend<32768>>();
|
||||||
|
|
||||||
|
return ::benchmark::RunSpecifiedBenchmarks();
|
||||||
|
}
|
||||||
@@ -1,13 +1,30 @@
|
|||||||
#include <trigdx/lookup_avx.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
template <typename Backend> void register_benchmarks() {
|
||||||
benchmark_sinf<LookupAVXBackend<16384>>();
|
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
|
||||||
benchmark_cosf<LookupAVXBackend<16384>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_sincosf<LookupAVXBackend<16384>>();
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
benchmark_sinf<LookupAVXBackend<32768>>();
|
->Arg(1e7);
|
||||||
benchmark_cosf<LookupAVXBackend<32768>>();
|
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
|
||||||
benchmark_sincosf<LookupAVXBackend<32768>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
::benchmark::Initialize(&argc, argv);
|
||||||
|
|
||||||
|
register_benchmarks<LookupAVXBackend<16384>>();
|
||||||
|
register_benchmarks<LookupAVXBackend<32768>>();
|
||||||
|
|
||||||
|
return ::benchmark::RunSpecifiedBenchmarks();
|
||||||
|
}
|
||||||
@@ -1,13 +1,30 @@
|
|||||||
#include <trigdx/lookup_xsimd.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
template <typename Backend> void register_benchmarks() {
|
||||||
benchmark_sinf<LookupXSIMDBackend<16384>>();
|
BENCHMARK_TEMPLATE(benchmark_sinf, Backend)
|
||||||
benchmark_cosf<LookupXSIMDBackend<16384>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_sincosf<LookupXSIMDBackend<16384>>();
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
benchmark_sinf<LookupXSIMDBackend<32768>>();
|
->Arg(1e7);
|
||||||
benchmark_cosf<LookupXSIMDBackend<32768>>();
|
BENCHMARK_TEMPLATE(benchmark_cosf, Backend)
|
||||||
benchmark_sincosf<LookupXSIMDBackend<32768>>();
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, Backend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
::benchmark::Initialize(&argc, argv);
|
||||||
|
|
||||||
|
register_benchmarks<LookupXSIMDBackend<16384>>();
|
||||||
|
register_benchmarks<LookupXSIMDBackend<32768>>();
|
||||||
|
|
||||||
|
return ::benchmark::RunSpecifiedBenchmarks();
|
||||||
|
}
|
||||||
@@ -1,9 +1,21 @@
|
|||||||
#include <trigdx/mkl.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
BENCHMARK_TEMPLATE(benchmark_sinf, MKLBackend)
|
||||||
benchmark_sinf<MKLBackend>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_cosf<MKLBackend>();
|
->Arg(1e5)
|
||||||
benchmark_sincosf<MKLBackend>();
|
->Arg(1e6)
|
||||||
}
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_cosf, MKLBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, MKLBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
|
||||||
|
BENCHMARK_MAIN();
|
||||||
@@ -1,9 +1,21 @@
|
|||||||
#include <trigdx/reference.hpp>
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
#include "benchmark_utils.hpp"
|
#include "benchmark_utils.hpp"
|
||||||
|
|
||||||
int main() {
|
BENCHMARK_TEMPLATE(benchmark_sinf, ReferenceBackend)
|
||||||
benchmark_sinf<ReferenceBackend>();
|
->Unit(benchmark::kMillisecond)
|
||||||
benchmark_cosf<ReferenceBackend>();
|
->Arg(1e5)
|
||||||
benchmark_sincosf<ReferenceBackend>();
|
->Arg(1e6)
|
||||||
}
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_cosf, ReferenceBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
BENCHMARK_TEMPLATE(benchmark_sincosf, ReferenceBackend)
|
||||||
|
->Unit(benchmark::kMillisecond)
|
||||||
|
->Arg(1e5)
|
||||||
|
->Arg(1e6)
|
||||||
|
->Arg(1e7);
|
||||||
|
|
||||||
|
BENCHMARK_MAIN();
|
||||||
@@ -1,76 +1,122 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <iomanip>
|
#include <cmath>
|
||||||
#include <iostream>
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
const size_t N = 1e7;
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
inline void report(const std::string &name, double sec, double throughput) {
|
void init_x(float *x, size_t n) {
|
||||||
std::ios state(nullptr);
|
for (size_t i = 0; i < n; ++i) {
|
||||||
state.copyfmt(std::cout);
|
x[i] = (i % 360) * 0.0174533f; // degrees to radians
|
||||||
std::cout << std::setw(7) << name << " -> ";
|
}
|
||||||
std::cout << "time: ";
|
|
||||||
std::cout << std::fixed << std::setprecision(3) << std::setfill('0');
|
|
||||||
std::cout << sec << " s, ";
|
|
||||||
std::cout << "throughput: " << throughput << " M elems/sec\n";
|
|
||||||
std::cout.copyfmt(state);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Backend> inline void benchmark_sinf() {
|
template <typename Backend>
|
||||||
std::vector<float> x(N), s(N);
|
static void benchmark_sinf(benchmark::State &state) {
|
||||||
|
const size_t N = static_cast<size_t>(state.range(0));
|
||||||
for (size_t i = 0; i < N; ++i)
|
|
||||||
x[i] = (i % 360) * 0.0174533f; // degrees to radians
|
|
||||||
|
|
||||||
Backend backend;
|
Backend backend;
|
||||||
backend.init(N);
|
|
||||||
|
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
backend.compute_sinf(N, x.data(), s.data());
|
backend.init(N);
|
||||||
|
float *x =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
float *s =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
|
state.counters["init_ms"] =
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
||||||
|
.count() /
|
||||||
|
1.e3;
|
||||||
|
|
||||||
double sec = std::chrono::duration<double>(end - start).count();
|
init_x(x, N);
|
||||||
double throughput = N / sec / 1e6;
|
|
||||||
|
|
||||||
report("sinf", sec, throughput);
|
for (auto _ : state) {
|
||||||
|
backend.compute_sinf(N, x, s);
|
||||||
|
benchmark::DoNotOptimize(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
backend.free_memory(x);
|
||||||
|
backend.free_memory(s);
|
||||||
|
|
||||||
|
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
|
||||||
|
static_cast<int64_t>(N));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Backend> inline void benchmark_cosf() {
|
template <typename Backend>
|
||||||
std::vector<float> x(N), c(N);
|
static void benchmark_cosf(benchmark::State &state) {
|
||||||
|
const size_t N = static_cast<size_t>(state.range(0));
|
||||||
for (size_t i = 0; i < N; ++i)
|
|
||||||
x[i] = (i % 360) * 0.0174533f; // degrees to radians
|
|
||||||
|
|
||||||
Backend backend;
|
Backend backend;
|
||||||
backend.init(N);
|
|
||||||
|
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
backend.compute_cosf(N, x.data(), c.data());
|
backend.init(N);
|
||||||
|
float *x =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
float *c =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
|
||||||
|
if (!x || !c) {
|
||||||
|
throw std::runtime_error("Buffer allocation failed");
|
||||||
|
}
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
|
state.counters["init_ms"] =
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
||||||
|
.count() /
|
||||||
|
1.e3;
|
||||||
|
|
||||||
double sec = std::chrono::duration<double>(end - start).count();
|
init_x(x, N);
|
||||||
double throughput = N / sec / 1e6;
|
|
||||||
|
|
||||||
report("cosf", sec, throughput);
|
for (auto _ : state) {
|
||||||
|
backend.compute_cosf(N, x, c);
|
||||||
|
benchmark::DoNotOptimize(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
backend.free_memory(x);
|
||||||
|
backend.free_memory(c);
|
||||||
|
|
||||||
|
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
|
||||||
|
static_cast<int64_t>(N));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Backend> inline void benchmark_sincosf() {
|
template <typename Backend>
|
||||||
std::vector<float> x(N), s(N), c(N);
|
static void benchmark_sincosf(benchmark::State &state) {
|
||||||
|
const size_t N = static_cast<size_t>(state.range(0));
|
||||||
for (size_t i = 0; i < N; ++i)
|
|
||||||
x[i] = (i % 360) * 0.0174533f; // degrees to radians
|
|
||||||
|
|
||||||
Backend backend;
|
Backend backend;
|
||||||
backend.init(N);
|
|
||||||
|
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
backend.compute_sincosf(N, x.data(), s.data(), c.data());
|
backend.init(N);
|
||||||
|
float *x =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
float *s =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
float *c =
|
||||||
|
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
|
||||||
|
if (!x || !s || !c) {
|
||||||
|
throw std::runtime_error("Buffer allocation failed");
|
||||||
|
}
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
|
state.counters["init_ms"] =
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
||||||
|
.count() /
|
||||||
|
1.e3;
|
||||||
|
|
||||||
double sec = std::chrono::duration<double>(end - start).count();
|
init_x(x, N);
|
||||||
double throughput = N / sec / 1e6;
|
|
||||||
|
|
||||||
report("sincosf", sec, throughput);
|
for (auto _ : state) {
|
||||||
|
backend.compute_sincosf(N, x, s, c);
|
||||||
|
benchmark::DoNotOptimize(s);
|
||||||
|
benchmark::DoNotOptimize(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
backend.free_memory(x);
|
||||||
|
backend.free_memory(s);
|
||||||
|
backend.free_memory(c);
|
||||||
|
|
||||||
|
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
|
||||||
|
static_cast<int64_t>(N));
|
||||||
}
|
}
|
||||||
|
|||||||
42
cmake/FindAVX.cmake
Normal file
42
cmake/FindAVX.cmake
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
include(CheckCXXSourceRuns)
|
||||||
|
|
||||||
|
set(SUPPORTED_COMPILERS Clang;GNU;Intel;IntelLLVM)
|
||||||
|
|
||||||
|
if(CMAKE_CXX_COMPILER_ID IN_LIST SUPPORTED_COMPILERS)
|
||||||
|
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
|
||||||
|
set(CMAKE_REQUIRED_FLAGS "-xHost") # ICC
|
||||||
|
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
|
||||||
|
set(CMAKE_REQUIRED_FLAGS "-march=native") # ICX
|
||||||
|
else()
|
||||||
|
set(CMAKE_REQUIRED_FLAGS "-march=native") # GCC/Clang
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# AVX check
|
||||||
|
check_cxx_source_runs(
|
||||||
|
"
|
||||||
|
#include <immintrin.h>
|
||||||
|
int main() {
|
||||||
|
__m256 a = _mm256_setzero_ps(); // AVX
|
||||||
|
(void) a;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
"
|
||||||
|
HAVE_AVX)
|
||||||
|
|
||||||
|
if(HAVE_AVX)
|
||||||
|
# AVX2 check
|
||||||
|
check_cxx_source_runs(
|
||||||
|
"
|
||||||
|
#include <immintrin.h>
|
||||||
|
int main() {
|
||||||
|
__m256i a = _mm256_set1_epi32(-1);
|
||||||
|
__m256i b = _mm256_abs_epi32(a); // AVX2
|
||||||
|
(void) b;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
"
|
||||||
|
HAVE_AVX2)
|
||||||
|
endif()
|
||||||
5
cmake/trigdx_config.hpp.in
Normal file
5
cmake/trigdx_config.hpp.in
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#cmakedefine TRIGDX_USE_MKL
|
||||||
|
#cmakedefine TRIGDX_USE_GPU
|
||||||
|
#cmakedefine TRIGDX_USE_XSIMD
|
||||||
@@ -11,7 +11,8 @@ public:
|
|||||||
GPUBackend();
|
GPUBackend();
|
||||||
~GPUBackend() override;
|
~GPUBackend() override;
|
||||||
|
|
||||||
void init(size_t n = 0) override;
|
void *allocate_memory(size_t bytes) const override;
|
||||||
|
void free_memory(void *ptr) const override;
|
||||||
void compute_sinf(size_t n, const float *x, float *s) const override;
|
void compute_sinf(size_t n, const float *x, float *s) const override;
|
||||||
void compute_cosf(size_t n, const float *x, float *c) const override;
|
void compute_cosf(size_t n, const float *x, float *c) const override;
|
||||||
void compute_sincosf(size_t n, const float *x, float *s,
|
void compute_sincosf(size_t n, const float *x, float *s,
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
// Base interface for all math backends
|
// Base interface for all math backends
|
||||||
class Backend {
|
class Backend {
|
||||||
@@ -10,6 +12,12 @@ public:
|
|||||||
// Optional initialization
|
// Optional initialization
|
||||||
virtual void init(size_t n = 0) {}
|
virtual void init(size_t n = 0) {}
|
||||||
|
|
||||||
|
virtual void *allocate_memory(size_t bytes) const {
|
||||||
|
return static_cast<void *>(new uint8_t[bytes]);
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual void free_memory(void *ptr) const { std::free(ptr); };
|
||||||
|
|
||||||
// Compute sine for n elements
|
// Compute sine for n elements
|
||||||
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
|
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
|
||||||
|
|
||||||
|
|||||||
19
include/trigdx/trigdx.hpp
Normal file
19
include/trigdx/trigdx.hpp
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <trigdx/trigdx_config.hpp>
|
||||||
|
|
||||||
|
#include <trigdx/lookup.hpp>
|
||||||
|
#include <trigdx/lookup_avx.hpp>
|
||||||
|
#include <trigdx/reference.hpp>
|
||||||
|
|
||||||
|
#if defined(TRIGDX_USE_MKL)
|
||||||
|
#include <trigdx/mkl.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(TRIGDX_USE_GPU)
|
||||||
|
#include <trigdx/gpu.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(TRIGDX_USE_XSIMD)
|
||||||
|
#include <trigdx/lookup_xsimd.hpp>
|
||||||
|
#endif
|
||||||
23
python/CMakeLists.txt
Normal file
23
python/CMakeLists.txt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
find_package(pybind11 CONFIG QUIET)
|
||||||
|
|
||||||
|
if(NOT pybind11_FOUND)
|
||||||
|
FetchContent_Declare(
|
||||||
|
pybind11
|
||||||
|
GIT_REPOSITORY https://github.com/pybind/pybind11.git
|
||||||
|
GIT_TAG v3.0.0)
|
||||||
|
FetchContent_MakeAvailable(pybind11)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
|
||||||
|
find_package(Python REQUIRED)
|
||||||
|
|
||||||
|
pybind11_add_module(pytrigdx bindings.cpp)
|
||||||
|
target_link_libraries(pytrigdx PRIVATE trigdx)
|
||||||
|
set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
|
||||||
|
|
||||||
|
set(PYTHON_SITE_PACKAGES
|
||||||
|
"${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
|
||||||
|
)
|
||||||
|
|
||||||
|
install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
|
||||||
|
install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})
|
||||||
16
python/__init__.py
Normal file
16
python/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .trigdx import MKL
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .trigdx import GPU
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .trigdx import LookupXSIMD16K, LookupXSIMD32K
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
96
python/bindings.cpp
Normal file
96
python/bindings.cpp
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include <pybind11/numpy.h>
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
#include <trigdx/trigdx.hpp>
|
||||||
|
|
||||||
|
namespace py = pybind11;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
py::array_t<T>
|
||||||
|
compute_sin(const Backend &backend,
|
||||||
|
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
|
||||||
|
const size_t n = x.shape(0);
|
||||||
|
if (x.ndim() != 1) {
|
||||||
|
throw py::value_error("Input array must be 1-dimensional");
|
||||||
|
}
|
||||||
|
|
||||||
|
const T *x_ptr = x.data();
|
||||||
|
|
||||||
|
py::array_t<float> s(n);
|
||||||
|
T *s_ptr = s.mutable_data();
|
||||||
|
|
||||||
|
backend.compute_sinf(n, x_ptr, s_ptr);
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
py::array_t<T>
|
||||||
|
compute_cos(const Backend &backend,
|
||||||
|
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
|
||||||
|
const size_t n = x.shape(0);
|
||||||
|
if (x.ndim() != 1) {
|
||||||
|
throw py::value_error("Input array must be 1-dimensional");
|
||||||
|
}
|
||||||
|
|
||||||
|
const T *x_ptr = x.data();
|
||||||
|
|
||||||
|
py::array_t<T> c(n);
|
||||||
|
T *c_ptr = c.mutable_data();
|
||||||
|
|
||||||
|
backend.compute_cosf(n, x_ptr, c_ptr);
|
||||||
|
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
std::tuple<py::array_t<T>, py::array_t<T>>
|
||||||
|
compute_sincos(const Backend &backend,
|
||||||
|
py::array_t<T, py::array::c_style | py::array::forcecast> x) {
|
||||||
|
const size_t n = x.shape(0);
|
||||||
|
if (x.ndim() != 1) {
|
||||||
|
throw py::value_error("Input array must be 1-dimensional");
|
||||||
|
}
|
||||||
|
|
||||||
|
const T *x_ptr = x.data();
|
||||||
|
|
||||||
|
py::array_t<T> s(n);
|
||||||
|
py::array_t<T> c(n);
|
||||||
|
|
||||||
|
backend.compute_sincosf(n, x_ptr, s.mutable_data(), c.mutable_data());
|
||||||
|
|
||||||
|
return std::make_tuple(s, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename BackendType>
|
||||||
|
void bind_backend(py::module &m, const char *name) {
|
||||||
|
py::class_<BackendType, Backend, std::shared_ptr<BackendType>>(m, name)
|
||||||
|
.def(py::init<>())
|
||||||
|
.def("compute_sinf", &compute_sin<float>)
|
||||||
|
.def("compute_cosf", &compute_cos<float>)
|
||||||
|
.def("compute_sincosf", &compute_sincos<float>);
|
||||||
|
}
|
||||||
|
|
||||||
|
PYBIND11_MODULE(trigdx, m) {
|
||||||
|
m.doc() = "TrigDx python bindings";
|
||||||
|
|
||||||
|
py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
|
||||||
|
.def("init", &Backend::init);
|
||||||
|
|
||||||
|
bind_backend<ReferenceBackend>(m, "Reference");
|
||||||
|
bind_backend<LookupBackend<16384>>(m, "Lookup16K");
|
||||||
|
bind_backend<LookupBackend<32768>>(m, "Lookup32K");
|
||||||
|
bind_backend<LookupAVXBackend<16384>>(m, "LookupAVX16K");
|
||||||
|
bind_backend<LookupAVXBackend<32768>>(m, "LookupAVX32K");
|
||||||
|
#if defined(TRIGDX_USE_MKL)
|
||||||
|
bind_backend<MKLBackend>(m, "MKL");
|
||||||
|
#endif
|
||||||
|
#if defined(TRIGDX_USE_GPU)
|
||||||
|
bind_backend<GPUBackend>(m, "GPU");
|
||||||
|
#endif
|
||||||
|
#if defined(TRIGDX_USE_XSIMD)
|
||||||
|
bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
|
||||||
|
bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
@@ -1,8 +1,30 @@
|
|||||||
add_library(trigdx reference.cpp lookup.cpp lookup_avx.cpp)
|
include(FetchContent)
|
||||||
|
include(FindAVX)
|
||||||
|
add_library(trigdx reference.cpp lookup.cpp)
|
||||||
|
|
||||||
|
if(HAVE_AVX2)
|
||||||
|
target_compile_definitions(trigdx PUBLIC HAVE_AVX2)
|
||||||
|
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
|
||||||
|
"IntelLLVM")
|
||||||
|
target_compile_options(trigdx PUBLIC -xCORE-AVX2)
|
||||||
|
else()
|
||||||
|
target_compile_options(trigdx PUBLIC -mavx2)
|
||||||
|
endif()
|
||||||
|
elseif(HAVE_AVX)
|
||||||
|
target_compile_definitions(trigdx PUBLIC HAVE_AVX)
|
||||||
|
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
|
||||||
|
"IntelLLVM")
|
||||||
|
target_compile_options(trigdx PUBLIC -xAVX)
|
||||||
|
else()
|
||||||
|
target_compile_options(trigdx PUBLIC -mavx)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
|
target_include_directories(trigdx PUBLIC ${PROJECT_SOURCE_DIR}/include)
|
||||||
|
|
||||||
target_compile_options(trigdx PRIVATE -O3 -march=native)
|
if(HAVE_AVX)
|
||||||
|
target_sources(trigdx PRIVATE lookup_avx.cpp)
|
||||||
|
endif()
|
||||||
|
|
||||||
if(TRIGDX_USE_MKL)
|
if(TRIGDX_USE_MKL)
|
||||||
find_package(MKL REQUIRED)
|
find_package(MKL REQUIRED)
|
||||||
@@ -20,7 +42,19 @@ if(TRIGDX_USE_GPU)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(TRIGDX_USE_XSIMD)
|
if(TRIGDX_USE_XSIMD)
|
||||||
find_package(xsimd REQUIRED)
|
# Requires XSIMD > 13 for architecture independent dispatching
|
||||||
|
find_package(xsimd 13 QUIET)
|
||||||
|
if(NOT TARGET xsimd)
|
||||||
|
FetchContent_Declare(
|
||||||
|
xsimd
|
||||||
|
GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
|
||||||
|
GIT_TAG 13.2.0)
|
||||||
|
FetchContent_MakeAvailable(xsimd)
|
||||||
|
endif()
|
||||||
target_sources(trigdx PRIVATE lookup_xsimd.cpp)
|
target_sources(trigdx PRIVATE lookup_xsimd.cpp)
|
||||||
target_link_libraries(trigdx PRIVATE xsimd)
|
target_link_libraries(trigdx PRIVATE xsimd)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
target_include_directories(
|
||||||
|
trigdx INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
|
||||||
|
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>)
|
||||||
|
|||||||
84
src/gpu.cpp
84
src/gpu.cpp
@@ -10,75 +10,63 @@
|
|||||||
|
|
||||||
struct GPUBackend::Impl {
|
struct GPUBackend::Impl {
|
||||||
|
|
||||||
~Impl() {
|
void *allocate_memory(size_t bytes) const {
|
||||||
if (h_x) {
|
void *ptr;
|
||||||
cudaFreeHost(h_x);
|
cudaMallocHost(&ptr, bytes);
|
||||||
}
|
return ptr;
|
||||||
if (h_s) {
|
|
||||||
cudaFreeHost(h_s);
|
|
||||||
}
|
|
||||||
if (h_c) {
|
|
||||||
cudaFreeHost(h_c);
|
|
||||||
}
|
|
||||||
if (d_x) {
|
|
||||||
cudaFree(d_x);
|
|
||||||
}
|
|
||||||
if (d_s) {
|
|
||||||
cudaFree(d_s);
|
|
||||||
}
|
|
||||||
if (d_c) {
|
|
||||||
cudaFree(d_c);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void init(size_t n) {
|
void free_memory(void *ptr) const { cudaFreeHost(ptr); }
|
||||||
const size_t bytes = n * sizeof(float);
|
|
||||||
cudaMallocHost(&h_x, bytes);
|
|
||||||
cudaMallocHost(&h_s, bytes);
|
|
||||||
cudaMallocHost(&h_c, bytes);
|
|
||||||
cudaMalloc(&d_x, bytes);
|
|
||||||
cudaMalloc(&d_s, bytes);
|
|
||||||
cudaMalloc(&d_c, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
void compute_sinf(size_t n, const float *x, float *s) const {
|
void compute_sinf(size_t n, const float *x, float *s) const {
|
||||||
const size_t bytes = n * sizeof(float);
|
const size_t bytes = n * sizeof(float);
|
||||||
std::memcpy(h_x, x, bytes);
|
float *d_x, *d_s;
|
||||||
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
|
cudaMalloc(&d_x, bytes);
|
||||||
launch_sincosf_kernel(d_x, d_s, d_c, n);
|
cudaMalloc(&d_s, bytes);
|
||||||
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
|
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
|
||||||
std::memcpy(s, h_s, bytes);
|
launch_sinf_kernel(d_x, d_s, n);
|
||||||
|
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(d_x);
|
||||||
|
cudaFree(d_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
void compute_cosf(size_t n, const float *x, float *c) const {
|
void compute_cosf(size_t n, const float *x, float *c) const {
|
||||||
const size_t bytes = n * sizeof(float);
|
const size_t bytes = n * sizeof(float);
|
||||||
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
|
float *d_x, *d_c;
|
||||||
launch_sincosf_kernel(d_x, d_s, d_c, n);
|
cudaMalloc(&d_x, bytes);
|
||||||
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
|
cudaMalloc(&d_c, bytes);
|
||||||
std::memcpy(c, h_c, bytes);
|
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
|
||||||
|
launch_cosf_kernel(d_x, d_c, n);
|
||||||
|
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(d_x);
|
||||||
|
cudaFree(d_c);
|
||||||
}
|
}
|
||||||
|
|
||||||
void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
|
void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
|
||||||
const size_t bytes = n * sizeof(float);
|
const size_t bytes = n * sizeof(float);
|
||||||
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
|
float *d_x, *d_s, *d_c;
|
||||||
|
cudaMalloc(&d_x, bytes);
|
||||||
|
cudaMalloc(&d_s, bytes);
|
||||||
|
cudaMalloc(&d_c, bytes);
|
||||||
|
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
|
||||||
launch_sincosf_kernel(d_x, d_s, d_c, n);
|
launch_sincosf_kernel(d_x, d_s, d_c, n);
|
||||||
cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
|
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost);
|
||||||
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
|
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(d_x);
|
||||||
|
cudaFree(d_s);
|
||||||
|
cudaFree(d_c);
|
||||||
}
|
}
|
||||||
|
|
||||||
float *h_x = nullptr;
|
|
||||||
float *h_s = nullptr;
|
|
||||||
float *h_c = nullptr;
|
|
||||||
float *d_x = nullptr;
|
|
||||||
float *d_s = nullptr;
|
|
||||||
float *d_c = nullptr;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
|
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
|
||||||
|
|
||||||
GPUBackend::~GPUBackend() = default;
|
GPUBackend::~GPUBackend() = default;
|
||||||
|
|
||||||
void GPUBackend::init(size_t n) { impl->init(n); }
|
void *GPUBackend::allocate_memory(size_t bytes) const {
|
||||||
|
return impl->allocate_memory(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
|
||||||
|
|
||||||
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
|
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
|
||||||
impl->compute_sinf(n, x, s);
|
impl->compute_sinf(n, x, s);
|
||||||
|
|||||||
@@ -6,6 +6,16 @@
|
|||||||
|
|
||||||
#include "trigdx/lookup_avx.hpp"
|
#include "trigdx/lookup_avx.hpp"
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX) && !defined(__AVX__)
|
||||||
|
static_assert(HAVE_AVX == 0, "__AVX__ should be defined when HAVE_AVX is "
|
||||||
|
"defined");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX2) && !defined(__AVX2__)
|
||||||
|
static_assert(HAVE_AVX2 == 0, "__AVX2__ should be defined when HAVE_AVX2 is "
|
||||||
|
"defined");
|
||||||
|
#endif
|
||||||
|
|
||||||
template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
|
template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
|
||||||
std::vector<float> lookup;
|
std::vector<float> lookup;
|
||||||
static constexpr std::size_t MASK = NR_SAMPLES - 1;
|
static constexpr std::size_t MASK = NR_SAMPLES - 1;
|
||||||
@@ -160,7 +170,6 @@ template <std::size_t NR_SAMPLES> struct LookupAVXBackend<NR_SAMPLES>::Impl {
|
|||||||
for (std::size_t i = 0; i < n; ++i) {
|
for (std::size_t i = 0; i < n; ++i) {
|
||||||
std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
|
std::size_t idx = static_cast<std::size_t>(x[i] * SCALE) & MASK;
|
||||||
std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
|
std::size_t idx_cos = (idx + NR_SAMPLES / 4) & MASK;
|
||||||
s[i] = lookup[idx];
|
|
||||||
c[i] = lookup[idx_cos];
|
c[i] = lookup[idx_cos];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -8,12 +8,20 @@
|
|||||||
template <std::size_t NR_SAMPLES> struct lookup_table {
|
template <std::size_t NR_SAMPLES> struct lookup_table {
|
||||||
static constexpr std::size_t MASK = NR_SAMPLES - 1;
|
static constexpr std::size_t MASK = NR_SAMPLES - 1;
|
||||||
static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
|
static constexpr float SCALE = NR_SAMPLES / (2.0f * float(M_PI));
|
||||||
lookup_table() : values{} {
|
static constexpr float PI_FRAC = 2.0f * M_PIf32 / NR_SAMPLES;
|
||||||
|
static constexpr float TERM1 = 1.0f; // 1
|
||||||
|
static constexpr float TERM2 = 0.5f; // 1/2!
|
||||||
|
static constexpr float TERM3 = 1.0f / 6.0f; // 1/3!
|
||||||
|
static constexpr float TERM4 = 1.0f / 24.0f; // 1/4!
|
||||||
|
|
||||||
|
constexpr lookup_table() : sin_values{}, cos_values{} {
|
||||||
for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
|
for (uint_fast32_t i = 0; i < NR_SAMPLES; i++) {
|
||||||
values[i] = sinf(i * (2.0f * float(M_PI) / NR_SAMPLES));
|
sin_values[i] = sinf(i * PI_FRAC);
|
||||||
|
cos_values[i] = cosf(i * PI_FRAC);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::array<float, NR_SAMPLES> values;
|
std::array<float, NR_SAMPLES> sin_values;
|
||||||
|
std::array<float, NR_SAMPLES> cos_values;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
|
template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
|
||||||
@@ -25,28 +33,56 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
|
|||||||
|
|
||||||
constexpr uint_fast32_t VL = b_type::size;
|
constexpr uint_fast32_t VL = b_type::size;
|
||||||
const uint_fast32_t VS = n - n % VL;
|
const uint_fast32_t VS = n - n % VL;
|
||||||
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
|
|
||||||
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
||||||
|
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
|
||||||
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
||||||
|
|
||||||
const m_type quarter_pi = m_type::broadcast(Q_PI);
|
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
|
||||||
|
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
|
||||||
|
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
|
||||||
|
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
|
||||||
|
|
||||||
uint_fast32_t i;
|
uint_fast32_t i;
|
||||||
for (i = 0; i < VS; i += VL) {
|
for (i = 0; i < VS; i += VL) {
|
||||||
const b_type vx = b_type::load(a + i, Tag());
|
const b_type vx = b_type::load(a + i, Tag());
|
||||||
const b_type scaled = xsimd::mul(vx, scale);
|
const b_type scaled = xsimd::mul(vx, scale);
|
||||||
m_type idx = xsimd::to_int(scaled);
|
m_type idx = xsimd::to_int(scaled);
|
||||||
m_type idx_cos = xsimd::add(idx, quarter_pi);
|
const b_type f_idx = xsimd::to_float(idx);
|
||||||
idx_cos = xsimd::bitwise_and(idx_cos, mask);
|
idx = xsimd::bitwise_and(idx, mask);
|
||||||
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
|
|
||||||
|
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
|
||||||
|
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
|
||||||
|
|
||||||
|
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
|
||||||
|
const b_type dx2 = xsimd::mul(dx, dx);
|
||||||
|
const b_type dx3 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type dx4 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type t2 = xsimd::mul(dx2, term2);
|
||||||
|
const b_type t3 = xsimd::mul(dx3, term3);
|
||||||
|
const b_type t4 = xsimd::mul(dx4, term4);
|
||||||
|
|
||||||
|
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
|
||||||
|
|
||||||
|
const b_type sindx = xsimd::sub(dx, t3);
|
||||||
|
|
||||||
|
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
|
||||||
|
|
||||||
cosv.store(c + i, Tag());
|
cosv.store(c + i, Tag());
|
||||||
}
|
}
|
||||||
for (; i < n; i++) {
|
for (; i < n; i++) {
|
||||||
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
|
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
|
||||||
lookup_table_.MASK;
|
|
||||||
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
|
|
||||||
|
|
||||||
c[i] = lookup_table_.values[idx_cos];
|
std::size_t masked = idx & lookup_table_.MASK;
|
||||||
|
const float cosv = lookup_table_.cos_values[masked];
|
||||||
|
const float sinv = lookup_table_.sin_values[masked];
|
||||||
|
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
|
||||||
|
const float dx2 = dx * dx;
|
||||||
|
const float dx3 = dx2 * dx;
|
||||||
|
const float dx4 = dx3 * dx;
|
||||||
|
const float cosdx =
|
||||||
|
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
|
||||||
|
const float sindx = dx - lookup_table_.TERM3 * dx3;
|
||||||
|
c[i] = cosv * cosdx - sinv * sindx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lookup_table<NR_SAMPLES> lookup_table_;
|
lookup_table<NR_SAMPLES> lookup_table_;
|
||||||
@@ -61,25 +97,53 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
|
|||||||
|
|
||||||
constexpr uint_fast32_t VL = b_type::size;
|
constexpr uint_fast32_t VL = b_type::size;
|
||||||
const uint_fast32_t VS = n - n % VL;
|
const uint_fast32_t VS = n - n % VL;
|
||||||
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
|
|
||||||
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
||||||
|
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
|
||||||
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
||||||
|
|
||||||
const m_type quarter_pi = m_type::broadcast(Q_PI);
|
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
|
||||||
|
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
|
||||||
|
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
|
||||||
|
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
|
||||||
|
|
||||||
uint_fast32_t i;
|
uint_fast32_t i;
|
||||||
for (i = 0; i < VS; i += VL) {
|
for (i = 0; i < VS; i += VL) {
|
||||||
const b_type vx = b_type::load(a + i, Tag());
|
const b_type vx = b_type::load(a + i, Tag());
|
||||||
const b_type scaled = xsimd::mul(vx, scale);
|
const b_type scaled = xsimd::mul(vx, scale);
|
||||||
m_type idx = xsimd::to_int(scaled);
|
m_type idx = xsimd::to_int(scaled);
|
||||||
idx = xsimd::bitwise_and(idx, mask);
|
b_type f_idx = xsimd::to_float(idx);
|
||||||
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
|
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
|
||||||
|
const b_type dx2 = xsimd::mul(dx, dx);
|
||||||
|
const b_type dx3 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type dx4 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type t2 = xsimd::mul(dx2, term2);
|
||||||
|
const b_type t3 = xsimd::mul(dx3, term3);
|
||||||
|
const b_type t4 = xsimd::mul(dx4, term4);
|
||||||
|
|
||||||
|
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
|
||||||
|
const b_type sindx = xsimd::sub(dx, t3);
|
||||||
|
|
||||||
|
idx = xsimd::bitwise_and(idx, mask);
|
||||||
|
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
|
||||||
|
const b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
|
||||||
|
|
||||||
|
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
|
||||||
sinv.store(s + i, Tag());
|
sinv.store(s + i, Tag());
|
||||||
}
|
}
|
||||||
for (; i < n; i++) {
|
for (; i < n; i++) {
|
||||||
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
|
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
|
||||||
lookup_table_.MASK;
|
std::size_t masked = idx & lookup_table_.MASK;
|
||||||
s[i] = lookup_table_.values[idx];
|
const float cosv = lookup_table_.cos_values[masked];
|
||||||
|
const float sinv = lookup_table_.sin_values[masked];
|
||||||
|
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
|
||||||
|
const float dx2 = dx * dx;
|
||||||
|
const float dx3 = dx2 * dx;
|
||||||
|
const float dx4 = dx3 * dx;
|
||||||
|
const float cosdx =
|
||||||
|
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
|
||||||
|
const float sindx = dx - lookup_table_.TERM3 * dx3;
|
||||||
|
|
||||||
|
s[i] = sinv * cosdx + cosv * sindx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lookup_table<NR_SAMPLES> lookup_table_;
|
lookup_table<NR_SAMPLES> lookup_table_;
|
||||||
@@ -94,31 +158,56 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
|
|||||||
|
|
||||||
constexpr uint_fast32_t VL = b_type::size;
|
constexpr uint_fast32_t VL = b_type::size;
|
||||||
const uint_fast32_t VS = n - n % VL;
|
const uint_fast32_t VS = n - n % VL;
|
||||||
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
|
|
||||||
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
const b_type scale = b_type::broadcast(lookup_table_.SCALE);
|
||||||
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
const m_type mask = m_type::broadcast(lookup_table_.MASK);
|
||||||
|
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
|
||||||
|
|
||||||
|
const b_type term1 = b_type::broadcast(lookup_table_.TERM1); // 1
|
||||||
|
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
|
||||||
|
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
|
||||||
|
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
|
||||||
|
|
||||||
const m_type quarter_pi = m_type::broadcast(Q_PI);
|
|
||||||
uint_fast32_t i;
|
uint_fast32_t i;
|
||||||
for (i = 0; i < VS; i += VL) {
|
for (i = 0; i < VS; i += VL) {
|
||||||
const b_type vx = b_type::load(a + i, Tag());
|
const b_type vx = b_type::load(a + i, Tag());
|
||||||
const b_type scaled = xsimd::mul(vx, scale);
|
const b_type scaled = xsimd::mul(vx, scale);
|
||||||
m_type idx = xsimd::to_int(scaled);
|
m_type idx = xsimd::to_int(scaled);
|
||||||
m_type idx_cos = xsimd::add(idx, quarter_pi);
|
b_type f_idx = xsimd::to_float(idx);
|
||||||
|
const b_type dx = xsimd::sub(vx, xsimd::mul(f_idx, pi_frac));
|
||||||
|
const b_type dx2 = xsimd::mul(dx, dx);
|
||||||
|
const b_type dx3 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type dx4 = xsimd::mul(dx2, dx);
|
||||||
|
const b_type t2 = xsimd::mul(dx2, term2);
|
||||||
|
const b_type t3 = xsimd::mul(dx3, term3);
|
||||||
|
const b_type t4 = xsimd::mul(dx4, term4);
|
||||||
|
|
||||||
idx = xsimd::bitwise_and(idx, mask);
|
idx = xsimd::bitwise_and(idx, mask);
|
||||||
idx_cos = xsimd::bitwise_and(idx_cos, mask);
|
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);
|
||||||
const b_type sinv = b_type::gather(lookup_table_.values.data(), idx);
|
b_type cosv = b_type::gather(lookup_table_.cos_values.data(), idx);
|
||||||
const b_type cosv = b_type::gather(lookup_table_.values.data(), idx_cos);
|
|
||||||
|
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
|
||||||
|
const b_type sindx = xsimd::sub(dx, t3);
|
||||||
|
|
||||||
|
sinv = xsimd::add(xsimd::mul(cosv, sindx), xsimd::mul(sinv, cosdx));
|
||||||
|
cosv = xsimd::sub(xsimd::mul(cosv, cosdx), xsimd::mul(sinv, sindx));
|
||||||
|
|
||||||
sinv.store(s + i, Tag());
|
sinv.store(s + i, Tag());
|
||||||
cosv.store(c + i, Tag());
|
cosv.store(c + i, Tag());
|
||||||
}
|
}
|
||||||
for (; i < n; i++) {
|
for (; i < n; i++) {
|
||||||
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE) &
|
std::size_t idx = static_cast<std::size_t>(a[i] * lookup_table_.SCALE);
|
||||||
lookup_table_.MASK;
|
std::size_t masked = idx & lookup_table_.MASK;
|
||||||
std::size_t idx_cos = (idx + Q_PI) & lookup_table_.MASK;
|
const float cosv = lookup_table_.cos_values[masked];
|
||||||
s[i] = lookup_table_.values[idx];
|
const float sinv = lookup_table_.sin_values[masked];
|
||||||
c[i] = lookup_table_.values[idx_cos];
|
const float dx = a[i] - idx * lookup_table_.PI_FRAC;
|
||||||
|
const float dx2 = dx * dx;
|
||||||
|
const float dx3 = dx2 * dx;
|
||||||
|
const float dx4 = dx3 * dx;
|
||||||
|
const float cosdx =
|
||||||
|
1.0f - lookup_table_.TERM2 * dx2 + lookup_table_.TERM4 * dx4;
|
||||||
|
const float sindx = dx - lookup_table_.TERM3 * dx3;
|
||||||
|
s[i] = sinv * cosdx + cosv * sindx;
|
||||||
|
c[i] = cosv * cosdx - sinv * sindx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lookup_table<NR_SAMPLES> lookup_table_;
|
lookup_table<NR_SAMPLES> lookup_table_;
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
include(FetchContent)
|
|
||||||
|
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
catch2
|
catch2
|
||||||
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
|
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
|
||||||
@@ -9,30 +7,30 @@ FetchContent_MakeAvailable(catch2)
|
|||||||
# Lookup backend test
|
# Lookup backend test
|
||||||
add_executable(test_lookup test_lookup.cpp)
|
add_executable(test_lookup test_lookup.cpp)
|
||||||
target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
|
target_link_libraries(test_lookup PRIVATE trigdx Catch2::Catch2WithMain)
|
||||||
|
add_test(NAME test_lookup COMMAND test_lookup)
|
||||||
|
|
||||||
# LookupAVX backend test
|
# LookupAVX backend test
|
||||||
add_executable(test_lookup_avx test_lookup_avx.cpp)
|
if(HAVE_AVX)
|
||||||
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
|
add_executable(test_lookup_avx test_lookup_avx.cpp)
|
||||||
|
target_link_libraries(test_lookup_avx PRIVATE trigdx Catch2::Catch2WithMain)
|
||||||
|
add_test(NAME test_lookup_avx COMMAND test_lookup_avx)
|
||||||
|
endif()
|
||||||
|
|
||||||
# MKL backend test
|
# MKL backend test
|
||||||
if(TRIGDX_USE_MKL)
|
if(TRIGDX_USE_MKL)
|
||||||
add_executable(test_mkl test_mkl.cpp)
|
add_executable(test_mkl test_mkl.cpp)
|
||||||
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
|
target_link_libraries(test_mkl PRIVATE trigdx Catch2::Catch2WithMain)
|
||||||
endif()
|
|
||||||
|
|
||||||
include(CTest)
|
|
||||||
add_test(NAME test_lookup COMMAND test_lookup)
|
|
||||||
|
|
||||||
if(TRIGDX_USE_MKL)
|
|
||||||
add_test(NAME test_mkl COMMAND test_mkl)
|
add_test(NAME test_mkl COMMAND test_mkl)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# GPU backend test
|
||||||
if(TRIGDX_USE_GPU)
|
if(TRIGDX_USE_GPU)
|
||||||
add_executable(test_gpu test_gpu.cpp)
|
add_executable(test_gpu test_gpu.cpp)
|
||||||
target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
|
target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
|
||||||
add_test(NAME test_gpu COMMAND test_gpu)
|
add_test(NAME test_gpu COMMAND test_gpu)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# XSIMD backend test
|
||||||
if(TRIGDX_USE_XSIMD)
|
if(TRIGDX_USE_XSIMD)
|
||||||
add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
|
add_executable(test_lookup_xsimd test_lookup_xsimd.cpp)
|
||||||
target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
|
target_link_libraries(test_lookup_xsimd PRIVATE trigdx Catch2::Catch2WithMain)
|
||||||
|
|||||||
@@ -3,8 +3,8 @@
|
|||||||
|
|
||||||
#include "test_utils.hpp"
|
#include "test_utils.hpp"
|
||||||
|
|
||||||
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); }
|
TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-6f); }
|
||||||
|
|
||||||
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); }
|
TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-6f); }
|
||||||
|
|
||||||
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); }
|
TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-6f); }
|
||||||
|
|||||||
@@ -4,16 +4,16 @@
|
|||||||
#include "test_utils.hpp"
|
#include "test_utils.hpp"
|
||||||
|
|
||||||
TEST_CASE("sincosf") {
|
TEST_CASE("sincosf") {
|
||||||
test_sincosf<LookupXSIMDBackend<16384>>(1e-2f);
|
test_sincosf<LookupXSIMDBackend<16384>>(1e-6f);
|
||||||
test_sincosf<LookupXSIMDBackend<32768>>(1e-2f);
|
test_sincosf<LookupXSIMDBackend<32768>>(1e-6f);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("sinf") {
|
TEST_CASE("sinf") {
|
||||||
test_sinf<LookupXSIMDBackend<16384>>(1e-2f);
|
test_sinf<LookupXSIMDBackend<16384>>(1e-6f);
|
||||||
test_sinf<LookupXSIMDBackend<32768>>(1e-2f);
|
test_sinf<LookupXSIMDBackend<32768>>(1e-6f);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("cosf") {
|
TEST_CASE("cosf") {
|
||||||
test_cosf<LookupXSIMDBackend<16384>>(1e-2f);
|
test_cosf<LookupXSIMDBackend<16384>>(1e-6f);
|
||||||
test_cosf<LookupXSIMDBackend<32768>>(1e-2f);
|
test_cosf<LookupXSIMDBackend<32768>>(1e-6f);
|
||||||
}
|
}
|
||||||
@@ -7,14 +7,17 @@
|
|||||||
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
||||||
#include <trigdx/reference.hpp>
|
#include <trigdx/reference.hpp>
|
||||||
|
|
||||||
const size_t N = 1e7;
|
const size_t N = 1234;
|
||||||
|
|
||||||
|
void init_x(std::vector<float> &x) {
|
||||||
|
for (size_t i = 0; i < x.size(); ++i) {
|
||||||
|
x[i] = (i % 360) * 0.0174533f; // degrees to radians
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Backend> inline void test_sinf(float tol) {
|
template <typename Backend> inline void test_sinf(float tol) {
|
||||||
std::vector<float> x(N), s_ref(N), s(N);
|
std::vector<float> x(N), s_ref(N), s(N);
|
||||||
|
init_x(x);
|
||||||
for (size_t i = 0; i < N; ++i) {
|
|
||||||
x[i] = float(i) * 0.01f;
|
|
||||||
}
|
|
||||||
|
|
||||||
ReferenceBackend ref;
|
ReferenceBackend ref;
|
||||||
Backend backend;
|
Backend backend;
|
||||||
@@ -30,10 +33,7 @@ template <typename Backend> inline void test_sinf(float tol) {
|
|||||||
|
|
||||||
template <typename Backend> inline void test_cosf(float tol) {
|
template <typename Backend> inline void test_cosf(float tol) {
|
||||||
std::vector<float> x(N), c_ref(N), c(N);
|
std::vector<float> x(N), c_ref(N), c(N);
|
||||||
|
init_x(x);
|
||||||
for (size_t i = 0; i < N; ++i) {
|
|
||||||
x[i] = float(i) * 0.01f;
|
|
||||||
}
|
|
||||||
|
|
||||||
ReferenceBackend ref;
|
ReferenceBackend ref;
|
||||||
Backend backend;
|
Backend backend;
|
||||||
@@ -49,10 +49,7 @@ template <typename Backend> inline void test_cosf(float tol) {
|
|||||||
|
|
||||||
template <typename Backend> inline void test_sincosf(float tol) {
|
template <typename Backend> inline void test_sincosf(float tol) {
|
||||||
std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
|
std::vector<float> x(N), s_ref(N), c_ref(N), s(N), c(N);
|
||||||
|
init_x(x);
|
||||||
for (size_t i = 0; i < N; ++i) {
|
|
||||||
x[i] = float(i) * 0.01f;
|
|
||||||
}
|
|
||||||
|
|
||||||
ReferenceBackend ref;
|
ReferenceBackend ref;
|
||||||
Backend backend;
|
Backend backend;
|
||||||
|
|||||||
Reference in New Issue
Block a user