diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index 228b88e..5c9e3ab 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -7,8 +7,8 @@ #include -void init_x(std::vector &x) { - for (size_t i = 0; i < x.size(); ++i) { +void init_x(float *x, size_t n) { + for (size_t i = 0; i < n; ++i) { x[i] = (i % 360) * 0.0174533f; // degrees to radians } } @@ -16,24 +16,31 @@ void init_x(std::vector &x) { template static void benchmark_sinf(benchmark::State &state) { const size_t N = static_cast(state.range(0)); - std::vector x(N), s(N); - init_x(x); Backend backend; auto start = std::chrono::high_resolution_clock::now(); backend.init(N); + float *x = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); + float *s = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); auto end = std::chrono::high_resolution_clock::now(); state.counters["init_ms"] = std::chrono::duration_cast(end - start) .count() / 1.e3; + init_x(x, N); + for (auto _ : state) { - backend.compute_sinf(N, x.data(), s.data()); + backend.compute_sinf(N, x, s); benchmark::DoNotOptimize(s); } + backend.free_memory(x); + backend.free_memory(s); + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(N)); } @@ -41,24 +48,31 @@ static void benchmark_sinf(benchmark::State &state) { template static void benchmark_cosf(benchmark::State &state) { const size_t N = static_cast(state.range(0)); - std::vector x(N), c(N); - init_x(x); Backend backend; auto start = std::chrono::high_resolution_clock::now(); backend.init(N); + float *x = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); + float *c = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); auto end = std::chrono::high_resolution_clock::now(); state.counters["init_ms"] = std::chrono::duration_cast(end - start) .count() / 1.e3; + init_x(x, N); + for (auto _ : state) { - backend.compute_cosf(N, x.data(), c.data()); + backend.compute_cosf(N, x, c); benchmark::DoNotOptimize(c); } + backend.free_memory(x); + backend.free_memory(c); + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(N)); } @@ -66,25 +80,35 @@ static void benchmark_cosf(benchmark::State &state) { template static void benchmark_sincosf(benchmark::State &state) { const size_t N = static_cast(state.range(0)); - std::vector x(N), s(N), c(N); - init_x(x); Backend backend; auto start = std::chrono::high_resolution_clock::now(); backend.init(N); + float *x = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); + float *s = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); + float *c = + reinterpret_cast(backend.allocate_memory(N * sizeof(float))); auto end = std::chrono::high_resolution_clock::now(); state.counters["init_ms"] = std::chrono::duration_cast(end - start) .count() / 1.e3; + init_x(x, N); + for (auto _ : state) { - backend.compute_sincosf(N, x.data(), s.data(), c.data()); + backend.compute_sincosf(N, x, s, c); benchmark::DoNotOptimize(s); benchmark::DoNotOptimize(c); } + backend.free_memory(x); + backend.free_memory(s); + backend.free_memory(c); + state.SetItemsProcessed(static_cast(state.iterations()) * static_cast(N)); } diff --git a/include/trigdx/gpu.hpp b/include/trigdx/gpu.hpp index 8fb4555..6c3d3aa 100644 --- a/include/trigdx/gpu.hpp +++ b/include/trigdx/gpu.hpp @@ -12,6 +12,8 @@ public: ~GPUBackend() override; void init(size_t n = 0) override; + void *allocate_memory(size_t bytes) const override; + void free_memory(void *ptr) const override; void compute_sinf(size_t n, const float *x, float *s) const override; void compute_cosf(size_t n, const float *x, float *c) const override; void compute_sincosf(size_t n, const float *x, float *s, diff --git a/include/trigdx/interface.hpp b/include/trigdx/interface.hpp index 9975486..bbc61ea 100644 --- a/include/trigdx/interface.hpp +++ b/include/trigdx/interface.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include // Base interface for all math backends class Backend { @@ -10,6 +11,12 @@ public: // Optional initialization virtual void init(size_t n = 0) {} + virtual void *allocate_memory(size_t bytes) const { + return std::malloc(bytes); + }; + + virtual void free_memory(void *ptr) const { std::free(ptr); }; + // Compute sine for n elements virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; diff --git a/src/gpu.cpp b/src/gpu.cpp index 910b1de..f13cc92 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -19,9 +19,17 @@ struct GPUBackend::Impl { } } + void *allocate_memory(size_t bytes) const { + void *ptr; + cudaMallocHost(&ptr, bytes); + return ptr; + } + + void free_memory(void *ptr) const { cudaFreeHost(ptr); } + void init(size_t n) { const size_t bytes = n * sizeof(float); - cudaMallocHost(&h_x, bytes); + h_x = reinterpret_cast(allocate_memory(bytes)); cudaMalloc(&d_x, bytes); } @@ -71,6 +79,12 @@ GPUBackend::~GPUBackend() = default; void GPUBackend::init(size_t n) { impl->init(n); } +void *GPUBackend::allocate_memory(size_t bytes) const { + return impl->allocate_memory(bytes); +} + +void GPUBackend::free_memory(void *ptr) const { impl->free_memory(ptr); } + void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const { impl->compute_sinf(n, x, s); }