diff --git a/include/trigdx/gpu.hpp b/include/trigdx/gpu.hpp index 8fb4555..899c16f 100644 --- a/include/trigdx/gpu.hpp +++ b/include/trigdx/gpu.hpp @@ -16,6 +16,7 @@ public: void compute_cosf(size_t n, const float *x, float *c) const override; void compute_sincosf(size_t n, const float *x, float *s, float *c) const override; + void compute_expf(size_t n, const float *x, float *e) const override; private: struct Impl; diff --git a/include/trigdx/interface.hpp b/include/trigdx/interface.hpp index 9975486..bb960d6 100644 --- a/include/trigdx/interface.hpp +++ b/include/trigdx/interface.hpp @@ -19,4 +19,7 @@ public: // Compute sine and cosine for n elements virtual void compute_sincosf(size_t n, const float *x, float *s, float *c) const = 0; + + // Compute exponent for n elements + virtual void compute_expf(size_t n, const float *x, float *e) const = 0; }; diff --git a/include/trigdx/mkl.hpp b/include/trigdx/mkl.hpp index ee2709b..466ec60 100644 --- a/include/trigdx/mkl.hpp +++ b/include/trigdx/mkl.hpp @@ -10,4 +10,6 @@ public: void compute_sincosf(size_t n, const float *x, float *s, float *c) const override; + + void compute_expf(size_t n, const float *x, float *e) const override; }; diff --git a/include/trigdx/reference.hpp b/include/trigdx/reference.hpp index 0259083..0ab5f86 100644 --- a/include/trigdx/reference.hpp +++ b/include/trigdx/reference.hpp @@ -10,4 +10,6 @@ public: void compute_sincosf(size_t n, const float *x, float *s, float *c) const override; + + void compute_expf(size_t n, const float *x, float *e) const override; }; diff --git a/src/gpu.cpp b/src/gpu.cpp index b6be96f..79b8232 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -20,6 +20,9 @@ struct GPUBackend::Impl { if (h_c) { cudaFreeHost(h_c); } + if (h_e) { + cudaFreeHost(h_e); + } if (d_x) { cudaFree(d_x); } @@ -29,6 +32,9 @@ struct GPUBackend::Impl { if (d_c) { cudaFree(d_c); } + if (d_e) { + cudaFree(d_e); + } } void init(size_t n) { @@ -36,9 +42,11 @@ struct GPUBackend::Impl { cudaMallocHost(&h_x, bytes); cudaMallocHost(&h_s, bytes); cudaMallocHost(&h_c, bytes); + cudaMallocHost(&h_e, bytes); cudaMalloc(&d_x, bytes); cudaMalloc(&d_s, bytes); cudaMalloc(&d_c, bytes); + cudaMalloc(&d_e, bytes); } void compute_sinf(size_t n, const float *x, float *s) const { @@ -70,12 +78,23 @@ struct GPUBackend::Impl { std::memcpy(c, h_c, bytes); } + void compute_expf(size_t n, const float *x, float *e) const { + const size_t bytes = n * sizeof(float); + std::memcpy(h_x, x, bytes); + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + launch_expf_kernel(d_x, d_e, n); + cudaMemcpy(h_e, d_e, bytes, cudaMemcpyDeviceToHost); + std::memcpy(e, h_e, bytes); + } + float *h_x = nullptr; float *h_s = nullptr; float *h_c = nullptr; + float *h_e = nullptr; float *d_x = nullptr; float *d_s = nullptr; float *d_c = nullptr; + float *d_e = nullptr; }; GPUBackend::GPUBackend() : impl(std::make_unique()) {} @@ -95,4 +114,8 @@ void GPUBackend::compute_cosf(size_t n, const float *x, float *c) const { void GPUBackend::compute_sincosf(size_t n, const float *x, float *s, float *c) const { impl->compute_sincosf(n, x, s, c); -} \ No newline at end of file +} + +void GPUBackend::compute_expf(size_t n, const float *x, float *e) const { + impl->compute_expf(n, x, e); +} diff --git a/src/gpu/gpu.cu b/src/gpu/gpu.cu index a4e48e2..90ad0ab 100644 --- a/src/gpu/gpu.cu +++ b/src/gpu/gpu.cu @@ -31,6 +31,15 @@ __global__ void kernel_sincosf(const float *__restrict__ x, } } +__global__ void kernel_expf(const float *__restrict__ x, float *__restrict__ e, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + // e[idx] = __expf(x[idx]); + e[idx] = expf(x[idx]); + } +} + namespace { inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) { return dim3((n + threadsPerBlock - 1) / threadsPerBlock); @@ -54,3 +63,9 @@ void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, size_t n) { dim3 grid = make_grid(n, blocks.x); kernel_sincosf<<>>(d_x, d_s, d_c, n); } + +void launch_expf_kernel(const float *d_x, float *d_e, size_t n) { + dim3 blocks(256); + dim3 grid = make_grid(n, blocks.x); + kernel_expf<<>>(d_x, d_e, n); +} diff --git a/src/gpu/gpu.cuh b/src/gpu/gpu.cuh index 2d49a88..92623df 100644 --- a/src/gpu/gpu.cuh +++ b/src/gpu/gpu.cuh @@ -6,3 +6,4 @@ void launch_sinf_kernel(const float *d_x, float *d_s, size_t n); void launch_cosf_kernel(const float *d_x, float *d_c, size_t n); void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, std::size_t n); +void launch_expf_kernel(const float *d_x, float *d_e, size_t n); diff --git a/src/mkl.cpp b/src/mkl.cpp index 6ea0a59..3f36c07 100644 --- a/src/mkl.cpp +++ b/src/mkl.cpp @@ -14,3 +14,7 @@ void MKLBackend::compute_sincosf(size_t n, const float *x, float *s, float *c) const { vmsSinCos(static_cast(n), x, s, c, VML_HA); } + +void MKLBackend::compute_expf(size_t n, const float *x, float *e) const { + vmsExp(static_cast(n), x, e, VML_HA); +} diff --git a/src/reference.cpp b/src/reference.cpp index d8be5a1..4679ff8 100644 --- a/src/reference.cpp +++ b/src/reference.cpp @@ -21,3 +21,9 @@ void ReferenceBackend::compute_sincosf(size_t n, const float *x, float *s, c[i] = cosf(x[i]); } } + +void ReferenceBackend::compute_expf(size_t n, const float *x, float *e) const { + for (size_t i = 0; i < n; ++i) { + e[i] = expf(x[i]); + } +} diff --git a/tests/test_mkl.cpp b/tests/test_mkl.cpp index 664300d..9d3f197 100644 --- a/tests/test_mkl.cpp +++ b/tests/test_mkl.cpp @@ -8,3 +8,5 @@ TEST_CASE("sinf") { test_sinf(1e-6f); } TEST_CASE("cosf") { test_cosf(1e-6f); } TEST_CASE("sincosf") { test_sincosf(1e-6f); } + +TEST_CASE("expf") { test_expf(1e-6f); } \ No newline at end of file diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index ae1657a..549de34 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -63,3 +63,19 @@ template inline void test_sincosf(float tol) { REQUIRE_THAT(c[i], Catch::Matchers::WithinAbs(c_ref[i], tol)); } } + +template inline void test_expf(float tol) { + std::vector x(N), e_ref(N), e(N); + init_x(x); + + ReferenceBackend ref; + Backend backend; + backend.init(N); + + ref.compute_expf(N, x.data(), e_ref.data()); + backend.compute_expf(N, x.data(), e.data()); + + for (size_t i = 0; i < N; ++i) { + REQUIRE_THAT(e[i], Catch::Matchers::WithinAbs(e_ref[i], tol)); + } +}