// Compiler Explorer
//
// Source code

#include <cuda_runtime.h>

#include <cstdio>
#include <iostream>
#include <utility>
#include <vector>

/**
 * Doubles, in place, every element of a row-major `width` x `height` int array.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread handles the single element
 * at its global (x, y) coordinate and does nothing if that coordinate lies
 * outside the array, so the launch must cover at least `width` x `height`
 * threads for every element to be processed. No shared memory is used.
 *
 * @param d_array  Array of at least `width * height` ints; it is dereferenced
 *                 inside the kernel.
 * @param width    Number of elements per row. Non-positive values make the
 *                 kernel a no-op.
 * @param height   Number of rows. Non-positive values make the kernel a no-op.
 */
__global__ void traverseArray(int *d_array, int width, int height) {
    // Reject non-positive extents up front. The thread coordinates below are
    // unsigned, so comparing them against a negative int would convert that
    // int to a huge unsigned bound and let every thread through the guard.
    if (width <= 0 || height <= 0) {
        return;
    }

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < static_cast<unsigned int>(width) && y < static_cast<unsigned int>(height)) {
        // Widen before multiplying so that y * width cannot wrap in 32 bits.
        const size_t idx = static_cast<size_t>(y) * static_cast<size_t>(width) + x;
        d_array[idx] *= 2;
    }
}

/**
 * Functor that times a callable with a pair of CUDA events.
 *
 * Both events are recorded on the null stream (`nullptr`), one immediately
 * before and one immediately after invoking the callable, and the host blocks
 * on the stop event before reading the elapsed time.
 */
struct CudaKernelExecutionTime {
    /**
     * Invokes `f(args...)` between a start and a stop event.
     *
     * @param f     Callable to time (typically a lambda that launches a kernel).
     * @param args  Arguments forwarded to `f`.
     * @return Elapsed time between the two events as reported by
     *         cudaEventElapsedTime (milliseconds), or -1.0f if any CUDA call
     *         fails. Failures are described on std::cerr. If event setup
     *         fails, `f` is not invoked.
     *
     * The pending CUDA error is inspected with cudaPeekAtLastError after the
     * call, so a failed launch is reported here but left in place for the
     * caller to query. A stale error from earlier work is reported the same way.
     */
    template<typename F, typename... Args>
    float operator()(F &&f, Args &&... args) {
        constexpr float kFailure = -1.0f;

        // Owns both events so they are released on every exit path,
        // including an exception thrown by the callable.
        struct EventPair {
            cudaEvent_t start = nullptr;
            cudaEvent_t stop = nullptr;

            ~EventPair() {
                if (start != nullptr) {
                    cudaEventDestroy(start);
                }
                if (stop != nullptr) {
                    cudaEventDestroy(stop);
                }
            }
        } events;

        // Reports a failed CUDA call on stderr; returns true on success.
        auto ok = [](cudaError_t status, const char *what) -> bool {
            if (status == cudaSuccess) {
                return true;
            }
            std::cerr << "CudaKernelExecutionTime: " << what << " failed: "
                      << cudaGetErrorString(status) << std::endl;
            return false;
        };

        if (!ok(cudaEventCreate(&events.start), "cudaEventCreate(start)") ||
            !ok(cudaEventCreate(&events.stop), "cudaEventCreate(stop)")) {
            return kFailure;
        }

        // Insert start event
        if (!ok(cudaEventRecord(events.start, nullptr), "cudaEventRecord(start)")) {
            return kFailure;
        }

        // Call kernel
        f(std::forward<Args>(args)...);

        // A kernel launch does not return a status; peek so the error (if any)
        // stays available to the caller.
        if (!ok(cudaPeekAtLastError(), "timed call")) {
            return kFailure;
        }

        // Insert stop event and wait for it so the elapsed time is valid
        if (!ok(cudaEventRecord(events.stop, nullptr), "cudaEventRecord(stop)") ||
            !ok(cudaEventSynchronize(events.stop), "cudaEventSynchronize(stop)")) {
            return kFailure;
        }

        float time = 0.0f;
        if (!ok(cudaEventElapsedTime(&time, events.start, events.stop),
                "cudaEventElapsedTime")) {
            return kFailure;
        }
        return time;
    }
};

/**
 * Demo driver: fills a 10 x 10 int array with 0..99 on the host, copies it to
 * the device, runs traverseArray over it while timing the launch with
 * CudaKernelExecutionTime, copies the result back and prints every element.
 *
 * @return 0 on success, 1 if any CUDA call fails (the failure is described on
 *         std::cerr).
 */
int main() {
    const int width = 10;
    const int height = 10;
    const int size = width * height;
    const size_t bytes = static_cast<size_t>(size) * sizeof(int);

    // Reports a failed CUDA call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Initialize array on host (vector releases the storage on every return path)
    std::vector<int> h_array(size);
    for (int i = 0; i < size; i++) {
        h_array[i] = i;
    }

    int *d_array = nullptr;
    if (!ok(cudaMalloc((void **) &d_array, bytes), "cudaMalloc")) {
        return 1;
    }

    // Free device memory on every return path from here on
    struct DeviceBufferGuard {
        int *ptr;

        ~DeviceBufferGuard() { cudaFree(ptr); }
    } d_guard{d_array};

    // Copy array from host to device
    if (!ok(cudaMemcpy(d_array, h_array.data(), bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy (host to device)")) {
        return 1;
    }

    // Launch configuration: 8 x 8 blocks, grid rounded up to cover the array
    dim3 blockSize(8, 8);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    // Run kernel
    auto launch_kernel = [&]() {
        traverseArray<<<gridSize, blockSize>>>(d_array, width, height);
    };
    CudaKernelExecutionTime timer;
    auto time = timer(launch_kernel);

    // A kernel launch reports configuration errors only through the last-error state
    if (!ok(cudaGetLastError(), "traverseArray launch")) {
        return 1;
    }

    // Print time
    std::cout << "Kernel execution time: " << time << "ms"
              << std::endl;

    // Copy array back from device to host; this also surfaces execution errors
    if (!ok(cudaMemcpy(h_array.data(), d_array, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)")) {
        return 1;
    }

    // Print modified array
    for (int i = 0; i < size; i++) {
        printf("Array element %d: %d\n", i, h_array[i]);
    }

    return 0;
}