Compiler Explorer

Source code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>

/// Reports a failed CUDA runtime call on standard output.
///
/// When `result` is `cudaSuccess` nothing is printed. Otherwise a single line
/// is written containing the numeric error code, its symbolic name and the
/// runtime's human-readable description.
///
/// This helper only reports: it neither aborts nor throws, so the caller
/// keeps running after a failure.
///
/// @param result Status value returned by a CUDA runtime API call.
void checkCudaError(cudaError_t result)
{
    if (result == cudaSuccess)
    {
        return;
    }

    // The bare enum value alone is hard to interpret, so print the name and
    // description alongside it.
    std::cout << "ERROR: " << static_cast<int>(result)
              << " (" << cudaGetErrorName(result) << "): "
              << cudaGetErrorString(result) << std::endl;
}

/// Writes a float3 to `out` in the form "(x,y,z)" with no spaces.
///
/// @param out Destination stream.
/// @param v   Vector whose x, y and z components are printed in that order.
/// @return    `out`, so insertions can be chained.
std::ostream& operator << (std::ostream & out, const float3 & v)
{
    return out << "(" << v.x << "," << v.y << "," << v.z << ")";
}

// Pair of float3 values that splitStructKernel reads as its input.
struct StructCombined
{
    float3 m_a; // value splitStructKernel stores for thread index 0
    float3 m_b; // value splitStructKernel stores for every other thread index
};

// Output element of splitStructKernel: one float3 written per thread.
struct StructSplit
{
    float3 m_singleValue; // copy of either StructCombined::m_a or StructCombined::m_b
};

// Copies one float3 member of combinedIn[0] into splitOut, one element per
// thread.
//
// Every thread reads combinedIn[0] into a local copy. The thread whose
// threadIdx.x is 0 stores m_a; every other thread stores m_b. The result is
// written to splitOut[threadIdx.x].
//
// Launch/size preconditions visible in this code:
//  - Only threadIdx.x is used as the index (blockIdx is ignored), so threads
//    with the same threadIdx.x in different blocks would write the same
//    element. main() launches a single block of 2 threads.
//  - There is no bounds check: splitOut must hold at least one element for
//    every threadIdx.x value in the block, and combinedIn at least one
//    element.
//
// NOTE(review): this kernel appears to be a minimal reproducer for the
// behavior described in the comment inside the body; the statement shape is
// left exactly as written for that reason.
__global__
void splitStructKernel(StructSplit* splitOut,
                       const StructCombined* combinedIn)
{
    int index = threadIdx.x;

float3 result;
    StructCombined combinedStruct = combinedIn[0];

// Expected with the inputs set up in main(): thread 0 stores (1,1,1) and
    // thread 1 stores (2,2,2). Reported by the original author: in release
    // mode both threads appear to store (2,2,2).
    //if(index < 32){
    if (index == 0)
    {
        float3 result1 = combinedStruct.m_a;
        result = result1;
    }
    else
    {
        float3 result2 = combinedStruct.m_b;
        result = result2;
    }
    splitOut[index].m_singleValue = result;
    //}
}

/// Runs splitStructKernel once on device 0 and prints each thread's result.
///
/// Uploads a StructCombined with m_a = (1,1,1) and m_b = (2,2,2), launches
/// one block of kThreadCount threads, copies the per-thread StructSplit
/// outputs back and prints them one per line. Every CUDA call is passed
/// through checkCudaError, which reports failures but does not stop the
/// program.
///
/// @return Always 0.
int main()
{
    // Number of threads launched and of output elements; the kernel writes
    // exactly one StructSplit per thread.
    constexpr int kThreadCount = 2;

    checkCudaError(cudaSetDevice(0));

    StructCombined hostCombinedStruct;
    // float literals: the fields are float, so avoid double constants.
    hostCombinedStruct.m_a = { 1.0f, 1.0f, 1.0f };
    hostCombinedStruct.m_b = { 2.0f, 2.0f, 2.0f };

    // Start from nullptr so a failed cudaMalloc leaves a well-defined pointer
    // for the later calls (cudaFree(nullptr) is a no-op).
    StructCombined* deviceCombinedStruct = nullptr;
    checkCudaError(cudaMalloc(&deviceCombinedStruct, sizeof(StructCombined)));
    checkCudaError(cudaMemcpy(deviceCombinedStruct, &hostCombinedStruct, sizeof(StructCombined), cudaMemcpyHostToDevice));

    // Zero both the host and device result buffers so that a failed launch
    // prints zeros instead of indeterminate memory.
    StructSplit hostSplitStruct[kThreadCount] = {};
    StructSplit* deviceSplitStruct = nullptr;
    checkCudaError(cudaMalloc(&deviceSplitStruct, sizeof(hostSplitStruct)));
    checkCudaError(cudaMemset(deviceSplitStruct, 0, sizeof(hostSplitStruct)));

    splitStructKernel<<<1, kThreadCount>>>(deviceSplitStruct, deviceCombinedStruct);
    // Launch-configuration errors are only reported via cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronize below.
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaDeviceSynchronize());

    checkCudaError(cudaMemcpy(hostSplitStruct, deviceSplitStruct, sizeof(hostSplitStruct), cudaMemcpyDeviceToHost));

    for (int i = 0; i < kThreadCount; ++i)
    {
        std::cout << hostSplitStruct[i].m_singleValue << std::endl;
    }

    checkCudaError(cudaFree(deviceCombinedStruct));
    checkCudaError(cudaFree(deviceSplitStruct));

    return 0;
}