Add GPUBackend

2025-08-01 14:20:32 +00:00
parent 404fbd3c02
commit b7c13be6c0
17 changed files with 233 additions and 11 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,3 +9,12 @@ if(USE_MKL)
  target_sources(trigdx PRIVATE mkl.cpp)
  target_link_libraries(trigdx PRIVATE MKL::MKL)
 endif()
+
+if(USE_GPU)
+  # Optional CUDA backend: the kernels live in their own shared library and
+  # the host-side wrapper (gpu.cpp) is compiled into trigdx itself.
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+
+  add_library(gpu SHARED gpu/gpu.cu)
+
+  target_sources(trigdx PRIVATE gpu.cpp)
+  target_link_libraries(trigdx PRIVATE CUDA::cudart gpu)
+endif()
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -0,0 +1,94 @@
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "gpu/gpu.cuh"
+#include "trigdx/gpu.hpp"
+
+// Private implementation of GPUBackend.
+//
+// Owns three pinned host staging buffers (h_*) and three device buffers
+// (d_*), each sized for the element count given to init(). Every compute_*
+// call copies the caller's input into the pinned staging buffer, uploads it,
+// runs a kernel, downloads the result(s) and copies them into the caller's
+// output array(s).
+//
+// CUDA failures are reported as std::runtime_error. A request for more
+// elements than init() allocated is rejected with std::invalid_argument.
+struct GPUBackend::Impl {
+
+  // A destructor must not throw, so release() ignores CUDA error codes.
+  ~Impl() { release(); }
+
+  // Allocates host staging and device buffers for up to n floats.
+  // Buffers from an earlier init() are freed first, so re-initialising does
+  // not leak. If any allocation fails, everything allocated so far is freed
+  // and std::runtime_error is thrown.
+  void init(size_t n) {
+    release();
+    if (n == 0) {
+      return;
+    }
+    const size_t bytes = n * sizeof(float);
+    try {
+      check(cudaMallocHost(&h_x, bytes), "cudaMallocHost");
+      check(cudaMallocHost(&h_s, bytes), "cudaMallocHost");
+      check(cudaMallocHost(&h_c, bytes), "cudaMallocHost");
+      check(cudaMalloc(&d_x, bytes), "cudaMalloc");
+      check(cudaMalloc(&d_s, bytes), "cudaMalloc");
+      check(cudaMalloc(&d_c, bytes), "cudaMalloc");
+    } catch (...) {
+      release();
+      throw;
+    }
+    capacity = n;
+  }
+
+  // Computes s[i] = sin(x[i]) for i in [0, n) on the device.
+  void compute_sinf(size_t n, const float *x, float *s) const {
+    if (!has_work(n)) {
+      return;
+    }
+    upload(n, x);
+    launch_sinf_kernel(d_x, d_s, n);
+    check(cudaGetLastError(), "sinf kernel launch");
+    download(n, d_s, h_s, s);
+  }
+
+  // Computes c[i] = cos(x[i]) for i in [0, n) on the device.
+  void compute_cosf(size_t n, const float *x, float *c) const {
+    if (!has_work(n)) {
+      return;
+    }
+    upload(n, x);
+    launch_cosf_kernel(d_x, d_c, n);
+    check(cudaGetLastError(), "cosf kernel launch");
+    download(n, d_c, h_c, c);
+  }
+
+  // Computes s[i] = sin(x[i]) and c[i] = cos(x[i]) for i in [0, n) with a
+  // single kernel launch.
+  void compute_sincosf(size_t n, const float *x, float *s, float *c) const {
+    if (!has_work(n)) {
+      return;
+    }
+    upload(n, x);
+    launch_sincosf_kernel(d_x, d_s, d_c, n);
+    check(cudaGetLastError(), "sincosf kernel launch");
+    download(n, d_s, h_s, s);
+    download(n, d_c, h_c, c);
+  }
+
+  float *h_x = nullptr;
+  float *h_s = nullptr;
+  float *h_c = nullptr;
+  float *d_x = nullptr;
+  float *d_s = nullptr;
+  float *d_c = nullptr;
+  // Number of floats each buffer can hold; 0 until init() succeeds.
+  size_t capacity = 0;
+
+private:
+  // Throws std::runtime_error naming the failed operation when status is not
+  // cudaSuccess.
+  static void check(cudaError_t status, const char *what) {
+    if (status != cudaSuccess) {
+      throw std::runtime_error(std::string(what) +
+                               " failed: " + cudaGetErrorString(status));
+    }
+  }
+
+  // Returns false when there is nothing to compute (n == 0); throws when the
+  // request does not fit into the buffers allocated by init().
+  bool has_work(size_t n) const {
+    if (n > capacity) {
+      throw std::invalid_argument(
+          "GPUBackend: n exceeds the size passed to init()");
+    }
+    return n != 0;
+  }
+
+  // Copies n floats from the caller's array into the pinned staging buffer
+  // and from there to the device input buffer.
+  void upload(size_t n, const float *x) const {
+    const size_t bytes = n * sizeof(float);
+    std::memcpy(h_x, x, bytes);
+    check(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice),
+          "cudaMemcpy (host to device)");
+  }
+
+  // Copies n floats from a device buffer into its pinned staging buffer and
+  // from there into the caller's output array. The blocking cudaMemcpy also
+  // waits for the preceding kernel to finish.
+  void download(size_t n, const float *d_src, float *h_stage,
+                float *dst) const {
+    const size_t bytes = n * sizeof(float);
+    check(cudaMemcpy(h_stage, d_src, bytes, cudaMemcpyDeviceToHost),
+          "cudaMemcpy (device to host)");
+    std::memcpy(dst, h_stage, bytes);
+  }
+
+  // Frees every buffer that is currently allocated and resets the state so
+  // the object can be initialised again.
+  void release() {
+    if (h_x) {
+      cudaFreeHost(h_x);
+    }
+    if (h_s) {
+      cudaFreeHost(h_s);
+    }
+    if (h_c) {
+      cudaFreeHost(h_c);
+    }
+    if (d_x) {
+      cudaFree(d_x);
+    }
+    if (d_s) {
+      cudaFree(d_s);
+    }
+    if (d_c) {
+      cudaFree(d_c);
+    }
+    h_x = h_s = h_c = nullptr;
+    d_x = d_s = d_c = nullptr;
+    capacity = 0;
+  }
+};
+
+// Creates the backend with a fresh Impl; no buffers are allocated until
+// init() is called.
+GPUBackend::GPUBackend()
+    : impl(std::make_unique<Impl>()) {
+}
+
+GPUBackend::~GPUBackend() = default; // defined here, where Impl is complete
+
+// Forwards the element count to the implementation, which allocates the
+// host and device buffers.
+void GPUBackend::init(size_t n) {
+  impl->init(n);
+}
+
+// Public entry point for the sine computation; the work is done by Impl.
+void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
+  const Impl &backend = *impl;
+  backend.compute_sinf(n, x, s);
+}
+
+// Public entry point for the cosine computation; the work is done by Impl.
+void GPUBackend::compute_cosf(size_t n, const float *x, float *c) const {
+  const Impl &backend = *impl;
+  backend.compute_cosf(n, x, c);
+}
+
+// Public entry point for the combined sine/cosine computation; the work is
+// done by Impl.
+void GPUBackend::compute_sincosf(size_t n, const float *x, float *s,
+                                 float *c) const {
+  const Impl &backend = *impl;
+  backend.compute_sincosf(n, x, s, c);
+}
--- a/src/gpu/gpu.cu
+++ b/src/gpu/gpu.cu
@@ -0,0 +1,56 @@
+#include <cuda_runtime.h>
+
+#include "gpu.cuh"
+
+// Writes s[i] = sinf(x[i]) for every i in [0, n).
+//
+// Expects a 1D launch. The grid-stride loop lets any grid size cover all n
+// elements. x and s must be non-overlapping (they are __restrict__) and hold
+// at least n floats each.
+__global__ void kernel_sinf(const float *__restrict__ x, float *__restrict__ s,
+                            size_t n) {
+  // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product
+  // and would wrap around for very large n.
+  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < n; idx += stride) {
+    // Fast-math alternative: s[idx] = __sinf(x[idx]);
+    s[idx] = sinf(x[idx]);
+  }
+}
+
+// Writes c[i] = cosf(x[i]) for every i in [0, n).
+//
+// Expects a 1D launch. The grid-stride loop lets any grid size cover all n
+// elements. x and c must be non-overlapping (they are __restrict__) and hold
+// at least n floats each.
+__global__ void kernel_cosf(const float *__restrict__ x, float *__restrict__ c,
+                            size_t n) {
+  // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product
+  // and would wrap around for very large n.
+  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < n; idx += stride) {
+    // Fast-math alternative: c[idx] = __cosf(x[idx]);
+    c[idx] = cosf(x[idx]);
+  }
+}
+
+// Writes s[i] = sin(x[i]) and c[i] = cos(x[i]) for every i in [0, n).
+//
+// Expects a 1D launch. The grid-stride loop lets any grid size cover all n
+// elements. x, s and c must be mutually non-overlapping (they are
+// __restrict__) and hold at least n floats each.
+__global__ void kernel_sincosf(const float *__restrict__ x,
+                               float *__restrict__ s, float *__restrict__ c,
+                               size_t n) {
+  // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product
+  // and would wrap around for very large n.
+  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < n; idx += stride) {
+    // Fast-math alternative: __sincosf(x[idx], &s[idx], &c[idx]);
+    // One combined call evaluates both results from a single load of x.
+    sincosf(x[idx], &s[idx], &c[idx]);
+  }
+}
+
+namespace {
+// Returns a 1D grid of ceil(n / threadsPerBlock) blocks, but never fewer
+// than one block. A grid dimension of zero is an invalid launch
+// configuration; with one block and n == 0 the kernels' bounds checks turn
+// the launch into a no-op.
+// NOTE: block counts that do not fit in unsigned int are truncated, as
+// before.
+inline dim3 make_grid(size_t n, size_t threadsPerBlock = 256) {
+  const size_t blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+  return dim3(static_cast<unsigned int>(blocks > 0 ? blocks : 1));
+}
+} // namespace
+
+// Launches kernel_sinf over n elements with 256-thread blocks on the default
+// stream. d_x and d_s are passed straight to the kernel.
+void launch_sinf_kernel(const float *d_x, float *d_s, size_t n) {
+  const dim3 threads(256);
+  const dim3 numBlocks = make_grid(n, threads.x);
+  kernel_sinf<<<numBlocks, threads>>>(d_x, d_s, n);
+}
+
+// Launches kernel_cosf over n elements with 256-thread blocks on the default
+// stream. d_x and d_c are passed straight to the kernel.
+void launch_cosf_kernel(const float *d_x, float *d_c, size_t n) {
+  const dim3 threads(256);
+  const dim3 numBlocks = make_grid(n, threads.x);
+  kernel_cosf<<<numBlocks, threads>>>(d_x, d_c, n);
+}
+
+// Launches kernel_sincosf over n elements with 256-thread blocks on the
+// default stream. d_x, d_s and d_c are passed straight to the kernel.
+void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c, size_t n) {
+  const dim3 threads(256);
+  const dim3 numBlocks = make_grid(n, threads.x);
+  kernel_sincosf<<<numBlocks, threads>>>(d_x, d_s, d_c, n);
+}
--- a/src/gpu/gpu.cuh
+++ b/src/gpu/gpu.cuh
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <cstddef>
+
+// Host-side launchers for the element-wise trigonometric kernels.
+// All pointers are passed unchanged to the kernels, which read n floats from
+// d_x and write n floats to each output array.
+// std::size_t is spelled out because <cstddef> only guarantees the name in
+// namespace std.
+
+// Computes d_s[i] = sin(d_x[i]) for i in [0, n).
+void launch_sinf_kernel(const float *d_x, float *d_s, std::size_t n);
+// Computes d_c[i] = cos(d_x[i]) for i in [0, n).
+void launch_cosf_kernel(const float *d_x, float *d_c, std::size_t n);
+// Computes d_s[i] = sin(d_x[i]) and d_c[i] = cos(d_x[i]) for i in [0, n).
+void launch_sincosf_kernel(const float *d_x, float *d_s, float *d_c,
+                           std::size_t n);
--- a/src/lookup.cpp
+++ b/src/lookup.cpp
@@ -46,7 +46,7 @@ LookupBackend<NR_SAMPLES>::LookupBackend() : impl(std::make_unique<Impl>()) {}
 template <size_t NR_SAMPLES>
 LookupBackend<NR_SAMPLES>::~LookupBackend() = default;

-template <size_t NR_SAMPLES> void LookupBackend<NR_SAMPLES>::init() {
+template <size_t NR_SAMPLES> void LookupBackend<NR_SAMPLES>::init(size_t) { // size argument is unnamed and ignored
  impl->init();
 }

--- a/src/lookup_avx.cpp
+++ b/src/lookup_avx.cpp
@@ -174,7 +174,8 @@ LookupAVXBackend<NR_SAMPLES>::LookupAVXBackend()
 template <std::size_t NR_SAMPLES>
 LookupAVXBackend<NR_SAMPLES>::~LookupAVXBackend() = default;

-template <std::size_t NR_SAMPLES> void LookupAVXBackend<NR_SAMPLES>::init() {
+template <std::size_t NR_SAMPLES>
+void LookupAVXBackend<NR_SAMPLES>::init(size_t) { // size argument is unnamed and ignored
  impl->init();
 }