Add GPUBackend

2025-08-01 14:20:32 +00:00
parent 404fbd3c02
commit b7c13be6c0
17 changed files with 233 additions and 11 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 option(USE_MKL "Enable Intel MKL backend" OFF)
+option(USE_GPU "Enable GPU backend" OFF)

 include_directories(${PROJECT_SOURCE_DIR}/include)

--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -11,3 +11,8 @@ if(USE_MKL)
  add_executable(benchmark_mkl benchmark_mkl.cpp)
  target_link_libraries(benchmark_mkl PRIVATE trigdx)
 endif()
+
+if(USE_GPU)
+  add_executable(benchmark_gpu benchmark_gpu.cpp)
+  target_link_libraries(benchmark_gpu PRIVATE trigdx gpu)
+endif()
--- a/benchmarks/benchmark_gpu.cpp
+++ b/benchmarks/benchmark_gpu.cpp
@@ -0,0 +1,9 @@
+#include <trigdx/gpu.hpp>
+
+#include "benchmark_utils.hpp"
+
+int main() {
+  benchmark_sinf<GPUBackend>();
+  benchmark_cosf<GPUBackend>();
+  benchmark_sincosf<GPUBackend>();
+}
--- a/benchmarks/benchmark_utils.hpp
+++ b/benchmarks/benchmark_utils.hpp
@@ -25,7 +25,7 @@ template <typename Backend> inline void benchmark_sinf() {
    x[i] = (i % 360) * 0.0174533f; // degrees to radians

  Backend backend;
-  backend.init();
+  backend.init(N);

  auto start = std::chrono::high_resolution_clock::now();
  backend.compute_sinf(N, x.data(), s.data());
@@ -44,7 +44,7 @@ template <typename Backend> inline void benchmark_cosf() {
    x[i] = (i % 360) * 0.0174533f; // degrees to radians

  Backend backend;
-  backend.init();
+  backend.init(N);

  auto start = std::chrono::high_resolution_clock::now();
  backend.compute_cosf(N, x.data(), c.data());
@@ -63,7 +63,7 @@ template <typename Backend> inline void benchmark_sincosf() {
    x[i] = (i % 360) * 0.0174533f; // degrees to radians

  Backend backend;
-  backend.init();
+  backend.init(N);

  auto start = std::chrono::high_resolution_clock::now();
  backend.compute_sincosf(N, x.data(), s.data(), c.data());
--- a/include/trigdx/gpu.hpp
+++ b/include/trigdx/gpu.hpp
@@ -0,0 +1,23 @@
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+
+#include "interface.hpp"
+
+class GPUBackend : public Backend {
+public:
+  GPUBackend();
+  ~GPUBackend() override;
+
+  void init(size_t n = 0) override;
+  void compute_sinf(size_t n, const float *x, float *s) const override;
+  void compute_cosf(size_t n, const float *x, float *c) const override;
+  void compute_sincosf(size_t n, const float *x, float *s,
+                       float *c) const override;
+
+private:
+  struct Impl;
+  std::unique_ptr<Impl> impl;
+};
--- a/include/trigdx/interface.hpp
+++ b/include/trigdx/interface.hpp
@@ -8,7 +8,7 @@ public:
  virtual ~Backend() = default;

  // Optional initialization
-  virtual void init() {}
+  virtual void init(size_t n = 0) {}

  // Compute sine for n elements
  virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;
--- a/include/trigdx/lookup.hpp
+++ b/include/trigdx/lookup.hpp
@@ -10,7 +10,7 @@ public:
  LookupBackend();
  ~LookupBackend() override;

-  void init() override;
+  void init(size_t n = 0) override;
  void compute_sinf(size_t n, const float *x, float *s) const override;
  void compute_cosf(size_t n, const float *x, float *c) const override;
  void compute_sincosf(size_t n, const float *x, float *s,
--- a/include/trigdx/lookup_avx.hpp
+++ b/include/trigdx/lookup_avx.hpp
@@ -10,7 +10,7 @@ public:
  LookupAVXBackend();
  ~LookupAVXBackend() override;

-  void init() override;
+  void init(size_t n = 0) override;
  void compute_sinf(std::size_t n, const float *x, float *s) const override;
  void compute_cosf(std::size_t n, const float *x, float *c) const override;
  void compute_sincosf(std::size_t n, const float *x, float *s,
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,3 +9,12 @@ if(USE_MKL)
  target_sources(trigdx PRIVATE mkl.cpp)
  target_link_libraries(trigdx PRIVATE MKL::MKL)
 endif()
+
+if(USE_GPU)
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  add_library(gpu SHARED gpu/gpu.cu)
+  target_sources(trigdx PRIVATE gpu.cpp)
+  target_link_libraries(trigdx PRIVATE CUDA::cudart)
+  target_link_libraries(trigdx PRIVATE gpu)
+endif()
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -0,0 +1,94 @@
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "gpu/gpu.cuh"
+#include "trigdx/gpu.hpp"
+
+struct GPUBackend::Impl {
+
+  ~Impl() {
+    if (h_x) {
+      cudaFreeHost(h_x);
+    }
+    if (h_s) {
+      cudaFreeHost(h_s);
+    }
+    if (h_c) {
+      cudaFreeHost(h_c);
+    }
+    if (d_x) {
+      cudaFree(d_x);
+    }
+    if (d_s) {
+      cudaFree(d_s);
+    }
+    if (d_c) {
+      cudaFree(d_c);
+    }
+  }
+
+  void init(size_t n) {
+    const size_t bytes = n * sizeof(float);
+    cudaMallocHost(&h_x, bytes);
+    cudaMallocHost(&h_s, bytes);
+    cudaMallocHost(&h_c, bytes);
+    cudaMalloc(&d_x, bytes);
+    cudaMalloc(&d_s, bytes);
+    cudaMalloc(&d_c, bytes);
+  }
+
+  void compute_sinf(size_t n, const float *x, float *s) const {
+    const size_t bytes = n * sizeof(float);
+    std::memcpy(h_x, x, bytes);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
+    std::memcpy(s, h_s, bytes);
+  }
+
+  void compute_cosf(size_t n, const float *x, float *c) const {
+    const size_t bytes = n * sizeof(float);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+    std::memcpy(c, h_c, bytes);
+  }
+
+  void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
+    const size_t bytes = n * sizeof(float);
+    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
+    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+  }
+
+  float *h_x = nullptr;
+  float *h_s = nullptr;
+  float *h_c = nullptr;
+  float *d_x = nullptr;
+  float *d_s = nullptr;
+  float *d_c = nullptr;
+};
+
+GPUBackend::GPUBackend() : impl(std::make_unique<Impl>()) {}
+
+GPUBackend::~GPUBackend() = default;
+
+void GPUBackend::init(size_t n) { impl->init(n); }
+
+void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
+  impl->compute_sinf(n, x, s);
+}
+
+void GPUBackend::compute_cosf(size_t n, const float *x, float *c) const {
+  impl->compute_cosf(n, x, c);
+}
+
+void GPUBackend::compute_sincosf(size_t n, const float *x, float *s,
+                                 float *c) const {
+  impl->compute_sincosf(n, x, s, c);
+}
--- a/src/gpu/gpu.cu
+++ b/src/gpu/gpu.cu
@@ -0,0 +1,56 @@
+#include <cuda_runtime.h>
+
+#include "gpu.cuh"
+
+__global__ void kernel_sinf(const float *__restrict__ x, float *__restrict__ s,
+                            size_t n) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    // s[idx] = __sinf(x[idx]);
+    s[idx] = sinf(x[idx]);
+  }
+}
+
+__global__ void kernel_cosf(const float *__restrict__ x, float *__restrict__ c,
+                            size_t n) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    // c[idx] = __cosf(x[idx]);
+    c[idx] = cosf(x[idx]);
+  }
+}
+
+__global__ void kernel_sincosf(const float *__restrict__ x,
+                               float *__restrict__ s, float *__restrict__ c,
+                               size_t n) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    // __sincosf(x[idx], &s[idx], &c[idx]);
+    s[idx] = sinf(x[idx]);
+    c[idx] = cosf(x[idx]);
+  }
+}
+
+namespace {
+inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) {
+  return dim3((n + threadsPerBlock - 1) / threadsPerBlock);
+}
+} // namespace
+
+void launch_sinf_kernel(const float *d_x, float *d_s, size_t n) {
+  dim3 blocks(256);
+  dim3 grid = make_grid(n, blocks.x);
+  kernel_sinf<<<grid, blocks>>>(d_x, d_s, n);
+}
+
+void launch_cosf_kernel(const float *d_x, float *d_c, size_t n) {
+  dim3 blocks(256);
+  dim3 grid = make_grid(n, blocks.x);
+  kernel_cosf<<<grid, blocks>>>(d_x, d_c, n);
+}
+
+void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, size_t n) {
+  dim3 blocks(256);
+  dim3 grid = make_grid(n, blocks.x);
+  kernel_sincosf<<<grid, blocks>>>(d_x, d_s, d_c, n);
+}
--- a/src/gpu/gpu.cuh
+++ b/src/gpu/gpu.cuh
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <cstddef>
+
+void launch_sinf_kernel(const float *d_x, float *d_s, size_t n);
+void launch_cosf_kernel(const float *d_x, float *d_c, size_t n);
+void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c,
+                           std::size_t n);
--- a/src/lookup.cpp
+++ b/src/lookup.cpp
@@ -46,7 +46,7 @@ LookupBackend<NR_SAMPLES>::LookupBackend() : impl(std::make_unique<Impl>()) {}
 template <size_t NR_SAMPLES>
 LookupBackend<NR_SAMPLES>::~LookupBackend() = default;

-template <size_t NR_SAMPLES> void LookupBackend<NR_SAMPLES>::init() {
+template <size_t NR_SAMPLES> void LookupBackend<NR_SAMPLES>::init(size_t) {
  impl->init();
 }

--- a/src/lookup_avx.cpp
+++ b/src/lookup_avx.cpp
@@ -174,7 +174,8 @@ LookupAVXBackend<NR_SAMPLES>::LookupAVXBackend()
 template <std::size_t NR_SAMPLES>
 LookupAVXBackend<NR_SAMPLES>::~LookupAVXBackend() = default;

-template <std::size_t NR_SAMPLES> void LookupAVXBackend<NR_SAMPLES>::init() {
+template <std::size_t NR_SAMPLES>
+void LookupAVXBackend<NR_SAMPLES>::init(size_t) {
  impl->init();
 }

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -26,3 +26,9 @@ add_test(NAME test_lookup COMMAND test_lookup)
 if(USE_MKL)
  add_test(NAME test_mkl COMMAND test_mkl)
 endif()
+
+if(USE_GPU)
+  add_executable(test_gpu test_gpu.cpp)
+  target_link_libraries(test_gpu PRIVATE trigdx Catch2::Catch2WithMain)
+  add_test(NAME test_gpu COMMAND test_gpu)
+endif()
--- a/tests/test_gpu.cpp
+++ b/tests/test_gpu.cpp
@@ -0,0 +1,10 @@
+#include <catch2/catch_test_macros.hpp>
+#include <trigdx/gpu.hpp>
+
+#include "test_utils.hpp"
+
+TEST_CASE("sinf") { test_sinf<GPUBackend>(1e-1f); }
+
+TEST_CASE("cosf") { test_cosf<GPUBackend>(1e-1f); }
+
+TEST_CASE("sincosf") { test_sincosf<GPUBackend>(1e-1f); }
--- a/tests/test_utils.hpp
+++ b/tests/test_utils.hpp
@@ -18,7 +18,7 @@ template <typename Backend> inline void test_sinf(float tol) {

  ReferenceBackend ref;
  Backend backend;
-  backend.init();
+  backend.init(N);

  ref.compute_sinf(N, x.data(), s_ref.data());
  backend.compute_sinf(N, x.data(), s.data());
@@ -37,7 +37,7 @@ template <typename Backend> inline void test_cosf(float tol) {

  ReferenceBackend ref;
  Backend backend;
-  backend.init();
+  backend.init(N);

  ref.compute_cosf(N, x.data(), c_ref.data());
  backend.compute_cosf(N, x.data(), c.data());
@@ -56,7 +56,7 @@ template <typename Backend> inline void test_sincosf(float tol) {

  ReferenceBackend ref;
  Backend backend;
-  backend.init();
+  backend.init(N);

  ref.compute_sincosf(N, x.data(), s_ref.data(), c_ref.data());
  backend.compute_sincosf(N, x.data(), s.data(), c.data());