1 Commits

Author SHA1 Message Date
Bram Veenboer
d7a7af5de9 TODO: first changes to add expf 2025-09-01 15:52:04 +02:00
19 changed files with 151 additions and 186 deletions

View File

@@ -7,5 +7,4 @@ repos:
rev: v0.6.13 rev: v0.6.13
hooks: hooks:
- id: cmake-format - id: cmake-format
- id: cmake-lint - id: cmake-lint
args: [--disabled-codes=C0301]

View File

@@ -12,11 +12,6 @@ option(TRIGDX_BUILD_TESTS "Build tests" ON)
option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON) option(TRIGDX_BUILD_BENCHMARKS "Build tests" ON)
option(TRIGDX_BUILD_PYTHON "Build Python interface" ON) option(TRIGDX_BUILD_PYTHON "Build Python interface" ON)
# Add compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wnon-virtual-dtor -Wduplicated-branches -Wvla -Wpointer-arith -Wextra -Wno-unused-parameter"
)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
configure_file( configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/cmake/trigdx_config.hpp.in

View File

@@ -1,54 +0,0 @@
# TrigDx
Highperformance C++ library offering multiple implementations of transcendental trigonometric functions (e.g., sin, cos, tan and their variants), designed for numerical, signalprocessing, and realtime systems where trading a small loss of accuracy for significantly higher throughput on modern CPUs (scalar and SIMD) and NVIDIA GPUs is acceptable.
## Why TrigDx?
Many applications use the standard library implementations, which prioritise correctness but are not always optimal for throughput on vectorized or GPU hardware. TrigDx gives you multiple implementations so you can:
- Replace `std::sin` / `std::cos` calls with faster approximations when a small, bounded reduction in accuracy is acceptable.
- Use SIMD/vectorized implementations and compact lookup tables for high throughput lookups.
- Run massively parallel kernels that take advantage of a GPU's _Special Function Units_ (SFUs).
## Requirements
- A C++ compiler with at least C++17 support (GCC, Clang)
- CMake 3.15+
- Optional: NVIDIA CUDA Toolkit 11+ to build GPU kernels
- Optional: GoogleTest (for unit tests) and GoogleBenchmark (for microbenchmarks)
## Building
```bash
git clone https://github.com/astron-rd/TrigDx.git
cd TrigDx
mkdir build && cd build
# CPU-only:
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_XSIMD=ON ..
cmake --build . -j
# Enable CUDA (if available):
cmake -DCMAKE_BUILD_TYPE=Release -DTRIGDX_USE_GPU=ON ..
cmake --build . -j
# Run tests:
ctest --output-on-failure -j
```
Common CMake options:
- `TRIGDX_USE_GPU=ON/OFF` — build GPU support.
- `TRIGDX_BUILD_TESTS=ON/OFF` — build tests.
- `TRIGDX_BUILD_BENCHMARKS=ON/OFF` — build benchmarks.
- `TRIGDX_BUILD_PYTHON` — build Python interface.
## Contributing
- Fork → create a feature branch → open a PR.
- Include unit tests for correctnesssensitive changes and benchmark results for performance changes.
- Follow project style (clangformat) and run tests locally before submitting.
## Reporting issues
When opening an issue for incorrect results or performance regressions, please include:
- Platform and CPU/GPU model.
- Compiler and version with exact compile flags.
- Small reproducer (input data and the TrigDx implementation used).
## License
See the LICENSE file in the repository for licensing details.

View File

@@ -2,14 +2,13 @@
#include <chrono> #include <chrono>
#include <cmath> #include <cmath>
#include <stdexcept>
#include <string> #include <string>
#include <vector> #include <vector>
#include <benchmark/benchmark.h> #include <benchmark/benchmark.h>
void init_x(float *x, size_t n) { void init_x(std::vector<float> &x) {
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < x.size(); ++i) {
x[i] = (i % 360) * 0.0174533f; // degrees to radians x[i] = (i % 360) * 0.0174533f; // degrees to radians
} }
} }
@@ -17,31 +16,24 @@ void init_x(float *x, size_t n) {
template <typename Backend> template <typename Backend>
static void benchmark_sinf(benchmark::State &state) { static void benchmark_sinf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), s(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_sinf(N, x, s); backend.compute_sinf(N, x.data(), s.data());
benchmark::DoNotOptimize(s); benchmark::DoNotOptimize(s);
} }
backend.free_memory(x);
backend.free_memory(s);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }
@@ -49,35 +41,24 @@ static void benchmark_sinf(benchmark::State &state) {
template <typename Backend> template <typename Backend>
static void benchmark_cosf(benchmark::State &state) { static void benchmark_cosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), c(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_cosf(N, x, c); backend.compute_cosf(N, x.data(), c.data());
benchmark::DoNotOptimize(c); benchmark::DoNotOptimize(c);
} }
backend.free_memory(x);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }
@@ -85,38 +66,25 @@ static void benchmark_cosf(benchmark::State &state) {
template <typename Backend> template <typename Backend>
static void benchmark_sincosf(benchmark::State &state) { static void benchmark_sincosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), s(N), c(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
if (!x || !s || !c) {
throw std::runtime_error("Buffer allocation failed");
}
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_sincosf(N, x, s, c); backend.compute_sincosf(N, x.data(), s.data(), c.data());
benchmark::DoNotOptimize(s); benchmark::DoNotOptimize(s);
benchmark::DoNotOptimize(c); benchmark::DoNotOptimize(c);
} }
backend.free_memory(x);
backend.free_memory(s);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }

View File

@@ -11,12 +11,12 @@ public:
GPUBackend(); GPUBackend();
~GPUBackend() override; ~GPUBackend() override;
void *allocate_memory(size_t bytes) const override; void init(size_t n = 0) override;
void free_memory(void *ptr) const override;
void compute_sinf(size_t n, const float *x, float *s) const override; void compute_sinf(size_t n, const float *x, float *s) const override;
void compute_cosf(size_t n, const float *x, float *c) const override; void compute_cosf(size_t n, const float *x, float *c) const override;
void compute_sincosf(size_t n, const float *x, float *s, void compute_sincosf(size_t n, const float *x, float *s,
float *c) const override; float *c) const override;
void compute_expf(size_t n, const float *x, float *e) const override;
private: private:
struct Impl; struct Impl;

View File

@@ -1,8 +1,6 @@
#pragma once #pragma once
#include <cstddef> #include <cstddef>
#include <cstdint>
#include <cstdlib>
// Base interface for all math backends // Base interface for all math backends
class Backend { class Backend {
@@ -12,12 +10,6 @@ public:
// Optional initialization // Optional initialization
virtual void init(size_t n = 0) {} virtual void init(size_t n = 0) {}
virtual void *allocate_memory(size_t bytes) const {
return static_cast<void *>(new uint8_t[bytes]);
};
virtual void free_memory(void *ptr) const { std::free(ptr); };
// Compute sine for n elements // Compute sine for n elements
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
@@ -27,4 +19,7 @@ public:
// Compute sine and cosine for n elements // Compute sine and cosine for n elements
virtual void compute_sincosf(size_t n, const float *x, float *s, virtual void compute_sincosf(size_t n, const float *x, float *s,
float *c) const = 0; float *c) const = 0;
// Compute exponent for n elements
virtual void compute_expf(size_t n, const float *x, float *e) const = 0;
}; };

View File

@@ -10,4 +10,6 @@ public:
void compute_sincosf(size_t n, const float *x, float *s, void compute_sincosf(size_t n, const float *x, float *s,
float *c) const override; float *c) const override;
void compute_expf(size_t n, const float *x, float *e) const override;
}; };

View File

@@ -10,4 +10,6 @@ public:
void compute_sincosf(size_t n, const float *x, float *s, void compute_sincosf(size_t n, const float *x, float *s,
float *c) const override; float *c) const override;
void compute_expf(size_t n, const float *x, float *e) const override;
}; };

View File

@@ -8,16 +8,5 @@ if(NOT pybind11_FOUND)
FetchContent_MakeAvailable(pybind11) FetchContent_MakeAvailable(pybind11)
endif() endif()
# Needed to set ${Python_VERSION_MAJOR} and ${Python_VERSION_MINOR}
find_package(Python REQUIRED)
pybind11_add_module(pytrigdx bindings.cpp) pybind11_add_module(pytrigdx bindings.cpp)
target_link_libraries(pytrigdx PRIVATE trigdx) target_link_libraries(pytrigdx PRIVATE trigdx)
set_target_properties(pytrigdx PROPERTIES OUTPUT_NAME "trigdx")
set(PYTHON_SITE_PACKAGES
"${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/trigdx"
)
install(TARGETS pytrigdx DESTINATION ${PYTHON_SITE_PACKAGES})
install(FILES __init__.py DESTINATION ${PYTHON_SITE_PACKAGES})

View File

@@ -1,16 +0,0 @@
from .trigdx import Reference, Lookup16K, Lookup32K, LookupAVX16K, LookupAVX32K
try:
from .trigdx import MKL
except ImportError:
pass
try:
from .trigdx import GPU
except ImportError:
pass
try:
from .trigdx import LookupXSIMD16K, LookupXSIMD32K
except ImportError:
pass

View File

@@ -72,9 +72,7 @@ void bind_backend(py::module &m, const char *name) {
.def("compute_sincosf", &compute_sincos<float>); .def("compute_sincosf", &compute_sincos<float>);
} }
PYBIND11_MODULE(trigdx, m) { PYBIND11_MODULE(pytrigdx, m) {
m.doc() = "TrigDx python bindings";
py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend") py::class_<Backend, std::shared_ptr<Backend>>(m, "Backend")
.def("init", &Backend::init); .def("init", &Backend::init);
@@ -93,4 +91,4 @@ PYBIND11_MODULE(trigdx, m) {
bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K"); bind_backend<LookupXSIMDBackend<16384>>(m, "LookupXSIMD16K");
bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K"); bind_backend<LookupXSIMDBackend<32768>>(m, "LookupXSIMD32K");
#endif #endif
} }

View File

@@ -10,63 +10,98 @@
struct GPUBackend::Impl { struct GPUBackend::Impl {
void *allocate_memory(size_t bytes) const { ~Impl() {
void *ptr; if (h_x) {
cudaMallocHost(&ptr, bytes); cudaFreeHost(h_x);
return ptr; }
if (h_s) {
cudaFreeHost(h_s);
}
if (h_c) {
cudaFreeHost(h_c);
}
if (h_e) {
cudaFreeHost(h_e);
}
if (d_x) {
cudaFree(d_x);
}
if (d_s) {
cudaFree(d_s);
}
if (d_c) {
cudaFree(d_c);
}
if (d_e) {
cudaFree(d_e);
}
} }
void free_memory(void *ptr) const { cudaFreeHost(ptr); } void init(size_t n) {
const size_t bytes = n * sizeof(float);
cudaMallocHost(&h_x, bytes);
cudaMallocHost(&h_s, bytes);
cudaMallocHost(&h_c, bytes);
cudaMallocHost(&h_e, bytes);
cudaMalloc(&d_x, bytes);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
cudaMalloc(&d_e, bytes);
}
void compute_sinf(size_t n, const float *x, float *s) const { void compute_sinf(size_t n, const float *x, float *s) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
float *d_x, *d_s; std::memcpy(h_x, x, bytes);
cudaMalloc(&d_x, bytes); cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
cudaMalloc(&d_s, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sinf_kernel(d_x, d_s, n); launch_sinf_kernel(d_x, d_s, n);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x); std::memcpy(s, h_s, bytes);
cudaFree(d_s);
} }
void compute_cosf(size_t n, const float *x, float *c) const { void compute_cosf(size_t n, const float *x, float *c) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
float *d_x, *d_c; std::memcpy(h_x, x, bytes);
cudaMalloc(&d_x, bytes); cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_cosf_kernel(d_x, d_c, n); launch_cosf_kernel(d_x, d_c, n);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x); std::memcpy(c, h_c, bytes);
cudaFree(d_c);
} }
void compute_sincosf(size_t n, const float *x, float *s, float *c) const { void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
float *d_x, *d_s, *d_c; std::memcpy(h_x, x, bytes);
cudaMalloc(&d_x, bytes); cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
cudaMalloc(&d_s, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
launch_sincosf_kernel(d_x, d_s, d_c, n); launch_sincosf_kernel(d_x, d_s, d_c, n);
cudaMemcpy(s, d_s, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost); cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x); std::memcpy(s, h_s, bytes);
cudaFree(d_s); std::memcpy(c, h_c, bytes);
cudaFree(d_c);
} }
void compute_expf(size_t n, const float *x, float *e) const {
const size_t bytes = n * sizeof(float);
std::memcpy(h_x, x, bytes);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
launch_expf_kernel(d_x, d_e, n);
cudaMemcpy(h_e, d_e, bytes, cudaMemcpyDeviceToHost);
std::memcpy(e, h_e, bytes);
}
float *h_x = nullptr;
float *h_s = nullptr;
float *h_c = nullptr;
float *h_e = nullptr;
float *d_x = nullptr;
float *d_s = nullptr;
float *d_c = nullptr;
float *d_e = nullptr;
}; };
GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {} GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
GPUBackend::~GPUBackend() = default; GPUBackend::~GPUBackend() = default;
void *GPUBackend::allocate_memory(size_t bytes) const { void GPUBackend::init(size_t n) { impl->init(n); }
return impl->allocate_memory(bytes);
}
void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); }
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const { void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
impl->compute_sinf(n, x, s); impl->compute_sinf(n, x, s);
@@ -79,4 +114,8 @@ void GPUBackend::compute_cosf(size_t n, const float *x, float *c) const {
void GPUBackend::compute_sincosf(size_t n, const float *x, float *s, void GPUBackend::compute_sincosf(size_t n, const float *x, float *s,
float *c) const { float *c) const {
impl->compute_sincosf(n, x, s, c); impl->compute_sincosf(n, x, s, c);
} }
void GPUBackend::compute_expf(size_t n, const float *x, float *e) const {
impl->compute_expf(n, x, e);
}

View File

@@ -31,6 +31,15 @@ __global__ void kernel_sincosf(const float *__restrict__ x,
} }
} }
__global__ void kernel_expf(const float *__restrict__ x, float *__restrict__ e,
size_t n) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
// e[idx] = __expf(x[idx]);
e[idx] = expf(x[idx]);
}
}
namespace { namespace {
inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) { inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) {
return dim3((n + threadsPerBlock - 1) / threadsPerBlock); return dim3((n + threadsPerBlock - 1) / threadsPerBlock);
@@ -54,3 +63,9 @@ void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, size_t n) {
dim3 grid = make_grid(n, blocks.x); dim3 grid = make_grid(n, blocks.x);
kernel_sincosf<<<grid, blocks>>>(d_x, d_s, d_c, n); kernel_sincosf<<<grid, blocks>>>(d_x, d_s, d_c, n);
} }
void launch_expf_kernel(const float *d_x, float *d_e, size_t n) {
dim3 blocks(256);
dim3 grid = make_grid(n, blocks.x);
kernel_expf<<<grid, blocks>>>(d_x, d_e, n);
}

View File

@@ -6,3 +6,4 @@ void launch_sinf_kernel(const float *d_x, float *d_s, size_t n);
void launch_cosf_kernel(const float *d_x, float *d_c, size_t n); void launch_cosf_kernel(const float *d_x, float *d_c, size_t n);
void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c,
std::size_t n); std::size_t n);
void launch_expf_kernel(const float *d_x, float *d_e, size_t n);

View File

@@ -20,8 +20,8 @@ template <std::size_t NR_SAMPLES> struct lookup_table {
cos_values[i] = cosf(i * PI_FRAC); cos_values[i] = cosf(i * PI_FRAC);
} }
} }
std::array<float, NR_SAMPLES> sin_values;
std::array<float, NR_SAMPLES> cos_values; std::array<float, NR_SAMPLES> cos_values;
std::array<float, NR_SAMPLES> sin_values;
}; };
template <std::size_t NR_SAMPLES> struct cosf_dispatcher { template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
@@ -33,6 +33,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -41,7 +42,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2! const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -59,7 +60,7 @@ template <std::size_t NR_SAMPLES> struct cosf_dispatcher {
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4); const b_type t4 = xsimd::mul(dx4, term3);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4); const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
@@ -97,6 +98,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
@@ -105,7 +107,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2! const b_type term2 = b_type::broadcast(lookup_table_.TERM2); // 1/2!
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -118,7 +120,7 @@ template <std::size_t NR_SAMPLES> struct sinf_dispatcher {
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4); const b_type t4 = xsimd::mul(dx4, term3);
const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4); const b_type cosdx = xsimd::add(xsimd::sub(term1, t2), t4);
const b_type sindx = xsimd::sub(dx, t3); const b_type sindx = xsimd::sub(dx, t3);
@@ -158,6 +160,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
constexpr uint_fast32_t VL = b_type::size; constexpr uint_fast32_t VL = b_type::size;
const uint_fast32_t VS = n - n % VL; const uint_fast32_t VS = n - n % VL;
const uint_fast32_t Q_PI = NR_SAMPLES / 4U;
const b_type scale = b_type::broadcast(lookup_table_.SCALE); const b_type scale = b_type::broadcast(lookup_table_.SCALE);
const m_type mask = m_type::broadcast(lookup_table_.MASK); const m_type mask = m_type::broadcast(lookup_table_.MASK);
const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC); const b_type pi_frac = b_type::broadcast(lookup_table_.PI_FRAC);
@@ -167,6 +170,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3! const b_type term3 = b_type::broadcast(lookup_table_.TERM3); // 1/3!
const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4! const b_type term4 = b_type::broadcast(lookup_table_.TERM4); // 1/4!
const m_type quarter_pi = m_type::broadcast(Q_PI);
uint_fast32_t i; uint_fast32_t i;
for (i = 0; i < VS; i += VL) { for (i = 0; i < VS; i += VL) {
const b_type vx = b_type::load(a + i, Tag()); const b_type vx = b_type::load(a + i, Tag());
@@ -179,7 +183,7 @@ template <std::size_t NR_SAMPLES> struct sin_cosf_dispatcher {
const b_type dx4 = xsimd::mul(dx2, dx); const b_type dx4 = xsimd::mul(dx2, dx);
const b_type t2 = xsimd::mul(dx2, term2); const b_type t2 = xsimd::mul(dx2, term2);
const b_type t3 = xsimd::mul(dx3, term3); const b_type t3 = xsimd::mul(dx3, term3);
const b_type t4 = xsimd::mul(dx4, term4); const b_type t4 = xsimd::mul(dx4, term3);
idx = xsimd::bitwise_and(idx, mask); idx = xsimd::bitwise_and(idx, mask);
b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx); b_type sinv = b_type::gather(lookup_table_.sin_values.data(), idx);

View File

@@ -14,3 +14,7 @@ void MKLBackend::compute_sincosf(size_t n, const float *x, float *s,
float *c) const { float *c) const {
vmsSinCos(static_cast<MKL_INT>(n), x, s, c, VML_HA); vmsSinCos(static_cast<MKL_INT>(n), x, s, c, VML_HA);
} }
void MKLBackend::compute_expf(size_t n, const float *x, float *e) const {
vmsExp(static_cast<MKL_INT>(n), x, e, VML_HA);
}

View File

@@ -21,3 +21,9 @@ void ReferenceBackend::compute_sincosf(size_t n, const float *x, float *s,
c[i] = cosf(x[i]); c[i] = cosf(x[i]);
} }
} }
void ReferenceBackend::compute_expf(size_t n, const float *x, float *e) const {
for (size_t i = 0; i < n; ++i) {
e[i] = expf(x[i]);
}
}

View File

@@ -8,3 +8,5 @@ TEST_CASE("sinf") { test_sinf<MKLBackend>(1e-6f); }
TEST_CASE("cosf") { test_cosf<MKLBackend>(1e-6f); } TEST_CASE("cosf") { test_cosf<MKLBackend>(1e-6f); }
TEST_CASE("sincosf") { test_sincosf<MKLBackend>(1e-6f); } TEST_CASE("sincosf") { test_sincosf<MKLBackend>(1e-6f); }
TEST_CASE("expf") { test_expf<MKLBackend>(1e-6f); }

View File

@@ -63,3 +63,19 @@ template <typename Backend> inline void test_sincosf(float tol) {
REQUIRE_THAT(c[i], Catch::Matchers::WithinAbs(c_ref[i], tol)); REQUIRE_THAT(c[i], Catch::Matchers::WithinAbs(c_ref[i], tol));
} }
} }
template <typename Backend> inline void test_expf(float tol) {
std::vector<float> x(N), e_ref(N), e(N);
init_x(x);
ReferenceBackend ref;
Backend backend;
backend.init(N);
ref.compute_expf(N, x.data(), e_ref.data());
backend.compute_expf(N, x.data(), e.data());
for (size_t i = 0; i < N; ++i) {
REQUIRE_THAT(e[i], Catch::Matchers::WithinAbs(e_ref[i], tol));
}
}