Add allocate_memory and free_memory

This commit is contained in:
Bram Veenboer
2025-09-02 11:55:33 +02:00
parent 716f323b26
commit 8df4bbf54e
4 changed files with 59 additions and 12 deletions

View File

@@ -7,8 +7,8 @@
#include <benchmark/benchmark.h> #include <benchmark/benchmark.h>
// Fills x[0..n) with a repeating 0..359 degree ramp converted to radians.
//
// Element i receives (i % 360) * 0.0174533f, so the pattern restarts every
// 360 elements.
//
// x: destination buffer with room for at least n floats. A null pointer is
//    tolerated and leaves nothing to do, because the benchmarks pass in
//    whatever the backend's allocate_memory() returned.
// n: number of elements to write; 0 is a no-op.
void init_x(float *x, size_t n) {
  // Single-precision degrees -> radians factor, kept verbatim so the
  // generated benchmark inputs do not change.
  constexpr float kDegreesToRadians = 0.0174533f;

  if (x == nullptr) {
    return;
  }
  for (size_t i = 0; i < n; ++i) {
    x[i] = static_cast<float>(i % 360) * kDegreesToRadians;
  }
}
@@ -16,24 +16,31 @@ void init_x(std::vector<float> &x) {
template <typename Backend> template <typename Backend>
static void benchmark_sinf(benchmark::State &state) { static void benchmark_sinf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), s(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_sinf(N, x.data(), s.data()); backend.compute_sinf(N, x, s);
benchmark::DoNotOptimize(s); benchmark::DoNotOptimize(s);
} }
backend.free_memory(x);
backend.free_memory(s);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }
@@ -41,24 +48,31 @@ static void benchmark_sinf(benchmark::State &state) {
template <typename Backend> template <typename Backend>
static void benchmark_cosf(benchmark::State &state) { static void benchmark_cosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), c(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_cosf(N, x.data(), c.data()); backend.compute_cosf(N, x, c);
benchmark::DoNotOptimize(c); benchmark::DoNotOptimize(c);
} }
backend.free_memory(x);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }
@@ -66,25 +80,35 @@ static void benchmark_cosf(benchmark::State &state) {
template <typename Backend> template <typename Backend>
static void benchmark_sincosf(benchmark::State &state) { static void benchmark_sincosf(benchmark::State &state) {
const size_t N = static_cast<size_t>(state.range(0)); const size_t N = static_cast<size_t>(state.range(0));
std::vector<float> x(N), s(N), c(N);
init_x(x);
Backend backend; Backend backend;
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
backend.init(N); backend.init(N);
float *x =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *s =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
float *c =
reinterpret_cast<float *>(backend.allocate_memory(N * sizeof(float)));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
state.counters["init_ms"] = state.counters["init_ms"] =
std::chrono::duration_cast<std::chrono::microseconds>(end - start) std::chrono::duration_cast<std::chrono::microseconds>(end - start)
.count() / .count() /
1.e3; 1.e3;
init_x(x, N);
for (auto _ : state) { for (auto _ : state) {
backend.compute_sincosf(N, x.data(), s.data(), c.data()); backend.compute_sincosf(N, x, s, c);
benchmark::DoNotOptimize(s); benchmark::DoNotOptimize(s);
benchmark::DoNotOptimize(c); benchmark::DoNotOptimize(c);
} }
backend.free_memory(x);
backend.free_memory(s);
backend.free_memory(c);
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
static_cast<int64_t>(N)); static_cast<int64_t>(N));
} }

View File

@@ -12,6 +12,8 @@ public:
// Destructor and setup hook, both overriding the base-class versions.
~GPUBackend() override; ~GPUBackend() override;
void init(size_t n = 0) override; void init(size_t n = 0) override;
// Buffer management overrides. The implementation file forwards both calls
// to GPUBackend::Impl, which uses cudaMallocHost / cudaFreeHost, so memory
// obtained from allocate_memory() has to be returned through free_memory().
void *allocate_memory(size_t bytes) const override;
void free_memory(void *ptr) const override;
// Compute entry points: n is the element count, x the read-only input and
// s / c the output buffer.
void compute_sinf(size_t n, const float *x, float *s) const override; void compute_sinf(size_t n, const float *x, float *s) const override;
void compute_cosf(size_t n, const float *x, float *c) const override; void compute_cosf(size_t n, const float *x, float *c) const override;
void compute_sincosf(size_t n, const float *x, float *s, void compute_sincosf(size_t n, const float *x, float *s,

View File

@@ -1,6 +1,7 @@
#pragma once #pragma once
#include <cstddef> #include <cstddef>
#include <cstdlib>
// Base interface for all math backends // Base interface for all math backends
class Backend { class Backend {
@@ -10,6 +11,12 @@ public:
// Optional setup hook. The default implementation does nothing and ignores
// n (which defaults to 0); the benchmark functions call init(N) with the
// element count before they allocate their buffers.
virtual void init(size_t n = 0) {} virtual void init(size_t n = 0) {}
// Allocates `bytes` bytes for use as an input/output buffer.
// The default implementation forwards to std::malloc, so the result can be
// nullptr (allocation failure, or possibly a zero-byte request) and callers
// should check it. Release the memory through free_memory() of the same
// backend: overrides are free to use a different allocator.
virtual void *allocate_memory(size_t bytes) const {
  return std::malloc(bytes);
}
// Releases memory previously obtained from allocate_memory().
// The default implementation forwards to std::free, for which a null
// pointer is a no-op.
virtual void free_memory(void *ptr) const { std::free(ptr); }
// Compute sine for n elements. x is the read-only input and s the output
// buffer (presumably s[i] = sin(x[i]) for i in [0, n) -- confirm against the
// concrete backends). Pure virtual: every backend must implement it.
virtual void compute_sinf(size_t n, const float *x, float *s) const = 0; virtual void compute_sinf(size_t n, const float *x, float *s) const = 0;

View File

@@ -19,9 +19,17 @@ struct GPUBackend::Impl {
} }
} }
void *allocate_memory(size_t bytes) const {
void *ptr;
cudaMallocHost(&ptr, bytes);
return ptr;
}
// Releases a buffer obtained from allocate_memory() through cudaFreeHost.
// A null pointer is ignored, matching the base Backend::free_memory where
// freeing nullptr is a no-op (std::free).
void free_memory(void *ptr) const {
  if (ptr != nullptr) {
    cudaFreeHost(ptr);
  }
}
void init(size_t n) { void init(size_t n) {
const size_t bytes = n * sizeof(float); const size_t bytes = n * sizeof(float);
cudaMallocHost(&h_x, bytes); h_x = reinterpret_cast<float *>(allocate_memory(bytes));
cudaMalloc(&d_x, bytes); cudaMalloc(&d_x, bytes);
} }
@@ -71,6 +79,12 @@ GPUBackend::~GPUBackend() = default;
// Forwards the setup request for n elements to Impl::init.
void GPUBackend::init(size_t n) { impl->init(n); } void GPUBackend::init(size_t n) { impl->init(n); }
// Hands the request for `bytes` bytes to Impl::allocate_memory and returns
// its result unchanged.
void *GPUBackend::allocate_memory(size_t bytes) const {
  void *const buffer = impl->allocate_memory(bytes);
  return buffer;
}
// Hands ptr to Impl::free_memory for release.
void GPUBackend::free_memory(void *ptr) const {
  impl->free_memory(ptr);
}
// Forwards the sine computation (n elements, input x, output s) to
// Impl::compute_sinf.
void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const { void GPUBackend::compute_sinf(size_t n, const float *x, float *s) const {
impl->compute_sinf(n, x, s); impl->compute_sinf(n, x, s);
} }