// Compiler Explorer
//
// Source code

#include <cassert>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <type_traits>
#include <vector>

#include <cuda.h>

void check(const auto err, const int line) {
    if (err != cudaSuccess) {
        std::cerr << "error: " << cudaGetErrorString(err) << "in line: " << line << "\n";
    }
}
// Forwards the status of a CUDA call to check() together with the source line of the call site.
#define CUDA(err) check(err, __LINE__)

// Owning handle for an array of T that lives in device memory.
//
// The constructors allocate the device buffer and the destructor releases it,
// so exactly one Vec may own a given buffer at a time: copying is disabled and
// moving transfers ownership.
template <typename T>
class Vec {
    // number of elements in the device buffer
    size_t size;
    // points to a memory location on device NOT host
    T* data_ptr;

public:
    static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                  "Vec<T> can only be instantiated with float or double.");

    // initialize vector of zeros of dimension size
    Vec(size_t size);

    // initialize from vector of values
    Vec(const std::vector<T>& vec);

    // A memberwise copy would leave two objects freeing the same device
    // pointer in their destructors, so copying is not allowed.
    Vec(const Vec&) = delete;
    Vec& operator=(const Vec&) = delete;

    // Move construction steals the buffer and leaves `other` empty
    // (size 0, null pointer), which the destructor handles safely.
    Vec(Vec&& other) noexcept : size(other.size), data_ptr(other.data_ptr) {
        other.size = 0;
        other.data_ptr = nullptr;
    }

    // Move assignment exchanges the two buffers; the buffer previously held
    // by *this is released when `other` is destroyed.
    Vec& operator=(Vec&& other) noexcept {
        if (this != &other) {
            const size_t previous_size = size;
            T* const previous_ptr = data_ptr;
            size = other.size;
            data_ptr = other.data_ptr;
            other.size = previous_size;
            other.data_ptr = previous_ptr;
        }
        return *this;
    }

    // destructor for device memory
    ~Vec();

    // dot product [TODO: Add an exterior product]
    template <typename U>
    friend U dot_product(const Vec<U>& A, const Vec<U>& B);
};

// Explicit instantiation declarations: Vec<float> and Vec<double> are not
// implicitly instantiated from here on; the matching explicit instantiation
// definitions follow the member definitions further down in this file.
extern template class Vec<float>;
extern template class Vec<double>;

// Threads per block used when launching vec_dot; also the length of the
// kernel's shared-memory scratch array.
constexpr auto BLOCK_SIZE = 512u;
// Guard against a block size above 1024 (the CUDA per-block thread limit on
// current hardware).
static_assert(BLOCK_SIZE <= 1024);

// Dot product kernel using a shared-memory block reduction.
//
// Launch layout: 1-D grid of 1-D blocks with blockDim.x == BLOCK_SIZE
// (asserted below). Any number of blocks is valid because each thread walks
// the input with a grid-stride loop. The halving reduction assumes BLOCK_SIZE
// is a power of two.
//
// vecA, vecB: device pointers to at least `size` elements each.
// size:       number of elements; values <= 0 contribute nothing.
// out:        device pointer to a single T. Every block atomically adds its
//             partial sum to *out, so the caller must set *out to zero before
//             the launch.
//
// Note: atomicAdd on double requires compute capability 6.0 or newer.
template <typename T>
__global__ void vec_dot(const T* vecA, const T* vecB, const int size, T* out) {
    static_assert(std::is_arithmetic<T>::value, "T must be a numeric type");
    assert(BLOCK_SIZE == blockDim.x);
    __shared__ T sdata[BLOCK_SIZE];

    const auto tid = threadIdx.x;

    // A negative size would otherwise convert to a huge unsigned bound and
    // make every thread read far out of range.
    const size_t count = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

    // Per-thread partial sum over a grid-stride walk of the input.
    T partial = 0;
    for (size_t idx = threadIdx.x + static_cast<size_t>(blockDim.x) * blockIdx.x;
         idx < count; idx += stride) {
        partial += vecA[idx] * vecB[idx];
    }
    sdata[tid] = partial;

    // Tree reduction of the per-thread partial sums held in shared memory.
    // The barrier at the top of each step makes the previous step's writes
    // (and the initial store above) visible before they are read.
    for (auto s = blockDim.x / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (tid < s) { sdata[tid] += sdata[tid + s]; }
    }
    __syncthreads();

    // One atomic per block folds the block total into the global result.
    if (tid == 0) { atomicAdd(out, sdata[0]); }
}

// Computes the dot product of two device vectors and returns it on the host.
//
// A, B: vectors of equal length whose data lives in device memory.
// Returns: sum over i of A[i] * B[i]; 0 for empty vectors.
// Throws: std::invalid_argument if the sizes differ;
//         std::length_error if the size does not fit the kernel's int
//         element count.
//
// CUDA failures are reported through the CUDA macro and are not thrown.
template <typename T>
T dot_product(const Vec<T>& A, const Vec<T>& B) {
    if (A.size != B.size) {
        throw std::invalid_argument("Vectors are not the same size.");
    }
    // vec_dot takes the element count as int; reject sizes that would be
    // silently truncated by the conversion.
    if (A.size > static_cast<size_t>(std::numeric_limits<int>::max())) {
        throw std::length_error("Vector is too large for the dot product kernel.");
    }

    T* out_device = nullptr;
    CUDA(cudaMalloc(&out_device, sizeof(T)));
    // The kernel accumulates into *out_device with atomicAdd, so it has to
    // start from zero; cudaMalloc does not clear the allocation.
    CUDA(cudaMemset(out_device, 0, sizeof(T)));

    vec_dot<<<100, BLOCK_SIZE>>>(A.data_ptr, B.data_ptr, static_cast<int>(A.size), out_device);
    // Launch-configuration errors are only visible through cudaGetLastError;
    // faults during execution surface at the synchronize below.
    CUDA(cudaGetLastError());
    CUDA(cudaDeviceSynchronize());

    // Initialised so the return value is defined even if the copy fails.
    T out_host = 0;
    CUDA(cudaMemcpy(&out_host, out_device, sizeof(T), cudaMemcpyDeviceToHost));

    CUDA(cudaFree(out_device));

    return out_host;
}

// Releases the device buffer held by this vector; a failure is reported
// through the CUDA macro.
template <typename T>
Vec<T>::~Vec() {
    T* const device_buffer = data_ptr;
    CUDA(cudaFree(device_buffer));
}

// Builds a vector of `size` elements in device memory with every byte of the
// buffer set to zero.
template <typename T>
Vec<T>::Vec(size_t size) : size(size), data_ptr(nullptr) {
    // `size` here is the constructor parameter, which shadows the member of
    // the same name; both hold the same value at this point.
    const size_t byte_count = size * sizeof(T);

    CUDA(cudaMalloc(&data_ptr, byte_count));
    CUDA(cudaMemset(data_ptr, 0, byte_count));
}

// Builds a device vector holding a copy of the host values in `vec`.
//
// vec: host-side source values; its length becomes the vector's size.
//
// data_ptr starts out null so that, if the allocation fails, the later copy
// and the destructor's cudaFree see a well-defined pointer rather than an
// indeterminate one.
template <typename T>
Vec<T>::Vec(const std::vector<T>& vec) : size(vec.size()), data_ptr(nullptr) {
    const size_t byte_count = size * sizeof(T);

    CUDA(cudaMalloc(&data_ptr, byte_count));
    CUDA(cudaMemcpy(data_ptr, vec.data(), byte_count, cudaMemcpyHostToDevice));
}

// Explicit instantiation definitions for the two element types that Vec's
// static_assert permits, plus the matching dot_product specializations.
template class Vec<float>;
template class Vec<double>;
template float dot_product<float>(const Vec<float>&, const Vec<float>&);
template double dot_product<double>(const Vec<double>&, const Vec<double>&);

// Demo driver: builds a device vector of 10000 ones, takes its dot product
// with itself and prints the result.
int main()
{
    // Fill constructor: 10000 elements, each 1.0f.
    const std::vector<float> host_values(10000, 1.0f);

    Vec<float> device_values(host_values);

    const auto result = dot_product(device_values, device_values);
    std::cout << "dot product = " << result << "\n";

    return 0;
}