diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9c22f..46c8e61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) option(USE_MKL "Enable Intel MKL backend" OFF) +option(USE_GPU "Enable GPU backend" OFF) include_directories(${PROJECT_SOURCE_DIR}/include) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index df44091..266c822 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -11,3 +11,8 @@ if(USE_MKL) add_executable(benchmark_mkl benchmark_mkl.cpp) target_link_libraries(benchmark_mkl PRIVATE trigdx) endif() + +if(USE_GPU) + add_executable(benchmark_gpu benchmark_gpu.cpp) + target_link_libraries(benchmark_gpu PRIVATE trigdx gpu) +endif() diff --git a/benchmarks/benchmark_gpu.cpp b/benchmarks/benchmark_gpu.cpp new file mode 100644 index 0000000..8595760 --- /dev/null +++ b/benchmarks/benchmark_gpu.cpp @@ -0,0 +1,9 @@ +#include + +#include "benchmark_utils.hpp" + +int main() { + benchmark_sinf(); + benchmark_cosf(); + benchmark_sincosf(); +} diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index eb09b92..e354c3f 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -25,7 +25,7 @@ template inline void benchmark_sinf() { x[i] = (i % 360) * 0.0174533f; // degrees to radians Backend backend; - backend.init(); + backend.init(N); auto start = std::chrono::high_resolution_clock::now(); backend.compute_sinf(N, x.data(), s.data()); @@ -44,7 +44,7 @@ template inline void benchmark_cosf() { x[i] = (i % 360) * 0.0174533f; // degrees to radians Backend backend; - backend.init(); + backend.init(N); auto start = std::chrono::high_resolution_clock::now(); backend.compute_cosf(N, x.data(), c.data()); @@ -63,7 +63,7 @@ template inline void benchmark_sincosf() { x[i] = (i % 360) * 0.0174533f; // degrees to radians Backend backend; - backend.init(); + backend.init(N); auto start = std::chrono::high_resolution_clock::now(); backend.compute_sincosf(N, x.data(), s.data(), c.data()); diff --git a/include/trigdx/gpu.hpp b/include/trigdx/gpu.hpp new file mode 100644 index 0000000..8fb4555 --- /dev/null +++ b/include/trigdx/gpu.hpp @@ -0,0 +1,23 @@ + +#pragma once + +#include +#include + +#include "interface.hpp" + +class GPUBackend : public Backend { +public: + GPUBackend(); + ~GPUBackend() override; + + void init(size_t n = 0) override; + void compute_sinf(size_t n, const float *x, float *s) const override; + void compute_cosf(size_t n, const float *x, float *c) const override; + void compute_sincosf(size_t n, const float *x, float *s, + float *c) const override; + +private: + struct Impl; + std::unique_ptr impl; +}; diff --git a/include/trigdx/interface.hpp b/include/trigdx/interface.hpp index 8b57348..9975486 100644 --- a/include/trigdx/interface.hpp +++ b/include/trigdx/interface.hpp @@ -8,7 +8,7 @@ public: virtual ~Backend() = default; // Optional initialization - virtual void init() {} + virtual void init(size_t n = 0) {} // Compute sine for n elements virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; diff --git a/include/trigdx/lookup.hpp b/include/trigdx/lookup.hpp index feac9b7..3ac2082 100644 --- a/include/trigdx/lookup.hpp +++ b/include/trigdx/lookup.hpp @@ -10,7 +10,7 @@ public: LookupBackend(); ~LookupBackend() override; - void init() override; + void init(size_t n = 0) override; void compute_sinf(size_t n, const float *x, float *s) const override; void compute_cosf(size_t n, const float *x, float *c) const override; void compute_sincosf(size_t n, const float *x, float *s, diff --git a/include/trigdx/lookup_avx.hpp b/include/trigdx/lookup_avx.hpp index 228a7ab..0ba2a95 100644 --- a/include/trigdx/lookup_avx.hpp +++ b/include/trigdx/lookup_avx.hpp @@ -10,7 +10,7 @@ public: LookupAVXBackend(); ~LookupAVXBackend() override; - void init() override; + void init(size_t n = 0) override; void compute_sinf(std::size_t n, const float *x, float *s) const override; void compute_cosf(std::size_t n, const float *x, float *c) const override; void compute_sincosf(std::size_t n, const float *x, float *s, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c371642..f26d1d1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,3 +9,12 @@ if(USE_MKL) target_sources(trigdx PRIVATE mkl.cpp) target_link_libraries(trigdx PRIVATE MKL::MKL) endif() + +if(USE_GPU) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + add_library(gpu SHARED gpu/gpu.cu) + target_sources(trigdx PRIVATE gpu.cpp) + target_link_libraries(trigdx PRIVATE CUDA::cudart) + target_link_libraries(trigdx PRIVATE gpu) +endif() diff --git a/src/gpu.cpp b/src/gpu.cpp new file mode 100644 index 0000000..38ef92b --- /dev/null +++ b/src/gpu.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include + +#include + +#include "gpu/gpu.cuh" +#include "trigdx/gpu.hpp" + +struct GPUBackend::Impl { + + ~Impl() { + if (h_x) { + cudaFreeHost(h_x); + } + if (h_s) { + cudaFreeHost(h_s); + } + if (h_c) { + cudaFreeHost(h_c); + } + if (d_x) { + cudaFree(d_x); + } + if (d_s) { + cudaFree(d_s); + } + if (d_c) { + cudaFree(d_c); + } + } + + void init(size_t n) { + const size_t bytes = n * sizeof(float); + cudaMallocHost(&h_x, bytes); + cudaMallocHost(&h_s, bytes); + cudaMallocHost(&h_c, bytes); + cudaMalloc(&d_x, bytes); + cudaMalloc(&d_s, bytes); + cudaMalloc(&d_c, bytes); + } + + void compute_sinf(size_t n, const float *x, float *s) const { + const size_t bytes = n * sizeof(float); + std::memcpy(h_x, x, bytes); + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + launch_sincosf_kernel(d_x, d_s, d_c, n); + cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost); + std::memcpy(s, h_s, bytes); + } + + void compute_cosf(size_t n, const float *x, float *c) const { + const size_t bytes = n * sizeof(float); + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + launch_sincosf_kernel(d_x, d_s, d_c, n); + cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost); + std::memcpy(c, h_c, bytes); + } + + void compute_sincosf(size_t n, const float *x, float *s, float *c) const { + const size_t bytes = n * sizeof(float); + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + launch_sincosf_kernel(d_x, d_s, d_c, n); + cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost); + } + + float *h_x = nullptr; + float *h_s = nullptr; + float *h_c = nullptr; + float *d_x = nullptr; + float *d_s = nullptr; + float *d_c = nullptr; +}; + +GPUBackend::GPUBackend() : impl(std::make_unique()) {} + +GPUBackend::~GPUBackend() = default; + +void GPUBackend::init(size_t n) { impl->init(n); } + +void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const { + impl->compute_sinf(n, x, s); +} + +void GPUBackend::compute_cosf(size_t n, const float *x, float *c) const { + impl->compute_cosf(n, x, c); +} + +void GPUBackend::compute_sincosf(size_t n, const float *x, float *s, + float *c) const { + impl->compute_sincosf(n, x, s, c); +} \ No newline at end of file diff --git a/src/gpu/gpu.cu b/src/gpu/gpu.cu new file mode 100644 index 0000000..a4e48e2 --- /dev/null +++ b/src/gpu/gpu.cu @@ -0,0 +1,56 @@ +#include + +#include "gpu.cuh" + +__global__ void kernel_sinf(const float *__restrict__ x, float *__restrict__ s, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + // s[idx] = __sinf(x[idx]); + s[idx] = sinf(x[idx]); + } +} + +__global__ void kernel_cosf(const float *__restrict__ x, float *__restrict__ c, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + // c[idx] = __cosf(x[idx]); + c[idx] = cosf(x[idx]); + } +} + +__global__ void kernel_sincosf(const float *__restrict__ x, + float *__restrict__ s, float *__restrict__ c, + size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + // __sincosf(x[idx], &s[idx], &c[idx]); + s[idx] = sinf(x[idx]); + c[idx] = cosf(x[idx]); + } +} + +namespace { +inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) { + return dim3((n + threadsPerBlock - 1) / threadsPerBlock); +} +} // namespace + +void launch_sinf_kernel(const float *d_x, float *d_s, size_t n) { + dim3 blocks(256); + dim3 grid = make_grid(n, blocks.x); + kernel_sinf<<>>(d_x, d_s, n); +} + +void launch_cosf_kernel(const float *d_x, float *d_c, size_t n) { + dim3 blocks(256); + dim3 grid = make_grid(n, blocks.x); + kernel_cosf<<>>(d_x, d_c, n); +} + +void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, size_t n) { + dim3 blocks(256); + dim3 grid = make_grid(n, blocks.x); + kernel_sincosf<<>>(d_x, d_s, d_c, n); +} diff --git a/src/gpu/gpu.cuh b/src/gpu/gpu.cuh new file mode 100644 index 0000000..2d49a88 --- /dev/null +++ b/src/gpu/gpu.cuh @@ -0,0 +1,8 @@ +#pragma once + +#include + +void launch_sinf_kernel(const float *d_x, float *d_s, size_t n); +void launch_cosf_kernel(const float *d_x, float *d_c, size_t n); +void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, + std::size_t n); diff --git a/src/lookup.cpp b/src/lookup.cpp index 0c40a08..607be19 100644 --- a/src/lookup.cpp +++ b/src/lookup.cpp @@ -46,7 +46,7 @@ LookupBackend::LookupBackend() : impl(std::make_unique()) {} template LookupBackend::~LookupBackend() = default; -template void LookupBackend::init() { +template void LookupBackend::init(size_t) { impl->init(); } diff --git a/src/lookup_avx.cpp b/src/lookup_avx.cpp index 81c9d04..b3313ff 100644 --- a/src/lookup_avx.cpp +++ b/src/lookup_avx.cpp @@ -174,7 +174,8 @@ LookupAVXBackend::LookupAVXBackend() template LookupAVXBackend::~LookupAVXBackend() = default; -template void LookupAVXBackend::init() { +template +void LookupAVXBackend::init(size_t) { impl->init(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b333768..9be3731 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -26,3 +26,9 @@ add_test(NAME test_lookup COMMAND test_lookup) if(USE_MKL) add_test(NAME test_mkl COMMAND test_mkl) endif() + +if(USE_GPU) + add_executable(test_gpu test_gpu.cpp) + target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain) + add_test(NAME test_gpu COMMAND test_gpu) +endif() diff --git a/tests/test_gpu.cpp b/tests/test_gpu.cpp new file mode 100644 index 0000000..6e06419 --- /dev/null +++ b/tests/test_gpu.cpp @@ -0,0 +1,10 @@ +#include +#include + +#include "test_utils.hpp" + +TEST_CASE("sinf") { test_sinf(1e-1f); } + +TEST_CASE("cosf") { test_cosf(1e-1f); } + +TEST_CASE("sincosf") { test_sincosf(1e-1f); } diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index dc64c4e..e366bef 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -18,7 +18,7 @@ template inline void test_sinf(float tol) { ReferenceBackend ref; Backend backend; - backend.init(); + backend.init(N); ref.compute_sinf(N, x.data(), s_ref.data()); backend.compute_sinf(N, x.data(), s.data()); @@ -37,7 +37,7 @@ template inline void test_cosf(float tol) { ReferenceBackend ref; Backend backend; - backend.init(); + backend.init(N); ref.compute_cosf(N, x.data(), c_ref.data()); backend.compute_cosf(N, x.data(), c.data()); @@ -56,7 +56,7 @@ template inline void test_sincosf(float tol) { ReferenceBackend ref; Backend backend; - backend.init(); + backend.init(N); ref.compute_sincosf(N, x.data(), s_ref.data(), c_ref.data()); backend.compute_sincosf(N, x.data(), s.data(), c.data());