Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C with Coccinelle
C++ with Coccinelle
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Mojo
Nim
Numba
Nix
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
PTX
Python
Racket
Raku
Ruby
Rust
Sail
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
Javascript
GIMPLE
Ygen
sway
cuda source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
10.0.0 sm_75 CUDA-10.2
10.0.1 sm_75 CUDA-10.2
11.0.0 sm_75 CUDA-10.2
16.0.0 sm_90 CUDA-11.8
17.0.1(libc++) sm_90 CUDA-12.1
18.1.0(libc++) sm_90 CUDA-12.3.1
19.1.0 sm_90 CUDA-12.5.1
20.1.0 sm_90 CUDA-12.5.1
20.1.0 sm_90 CUDA-12.6.1
20.1.0 sm_90 CUDA-12.6.2
NVCC 10.0.130
NVCC 10.1.105
NVCC 10.1.168
NVCC 10.1.243
NVCC 10.2.89
NVCC 11.0.2
NVCC 11.0.3
NVCC 11.1.0
NVCC 11.1.1
NVCC 11.2.0
NVCC 11.2.1
NVCC 11.2.2
NVCC 11.3.0
NVCC 11.3.1
NVCC 11.4.0
NVCC 11.4.1
NVCC 11.4.2
NVCC 11.4.3
NVCC 11.4.4
NVCC 11.5.0
NVCC 11.5.1
NVCC 11.5.2
NVCC 11.6.0
NVCC 11.6.1
NVCC 11.6.2
NVCC 11.7.0
NVCC 11.7.1
NVCC 11.8.0
NVCC 12.0.0
NVCC 12.0.1
NVCC 12.1.0
NVCC 12.2.1
NVCC 12.3.1
NVCC 12.4.1
NVCC 12.5.1
NVCC 12.6.1
NVCC 12.6.2
NVCC 12.8.1
NVCC 9.1.85
NVCC 9.2.88
NVRTC 11.0.2
NVRTC 11.0.3
NVRTC 11.1.0
NVRTC 11.1.1
NVRTC 11.2.0
NVRTC 11.2.1
NVRTC 11.2.2
NVRTC 11.3.0
NVRTC 11.3.1
NVRTC 11.4.0
NVRTC 11.4.1
NVRTC 11.5.0
NVRTC 11.5.1
NVRTC 11.5.2
NVRTC 11.6.0
NVRTC 11.6.1
NVRTC 11.6.2
NVRTC 11.7.0
NVRTC 11.7.1
NVRTC 11.8.0
NVRTC 12.0.0
NVRTC 12.0.1
NVRTC 12.1.0
clang 7.0.0 sm_70 CUDA-9.1
clang 8.0.0 sm_75 CUDA-10.0
clang 9.0.0 sm_75 CUDA-10.1
clang rocm-4.5.2
clang rocm-5.0.2
clang rocm-5.1.3
clang rocm-5.2.3
clang rocm-5.3.2
clang rocm-5.7.0
clang rocm-6.0.2
clang rocm-6.1.2
clang rocm-6.2.4
clang rocm-6.3.3
clang rocm-6.4.0
clang staging rocm-6.1.2
clang staging rocm-6.2.4
clang staging rocm-6.3.3
clang staging rocm-6.4.0
clang trunk rocm-6.1.2
clang trunk rocm-6.2.4
clang trunk rocm-6.3.3
clang trunk rocm-6.4.0
trunk sm_100a CUDA-12.8.1
Options
Source code
#include <curand_kernel.h>
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Build note: the kernels below call cudaMalloc/cudaFree from device code.
// These are device-runtime calls and are expected to need relocatable device
// code (e.g. nvcc -rdc=true -lcudadevrt) -- confirm against your toolchain.

// Starting fitness assigned to every freshly created network.
#define BALANCE 10000.0f

// Evaluates a CUDA runtime call and aborts with file/line context on failure.
#define gpuErrchk(ans)                                \
    do {                                              \
        gpuAssert((ans), __FILE__, __LINE__);         \
    } while (0)

// Reports a failed CUDA runtime call and, unless abort is false, exits with
// the error code as the process status.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Directed weighted edge between two neurons, addressed by neuron id.
struct Connection {
    int from;
    int to;
    float weight;
};

struct Neuron {
    float bias;
    int id;
};

// A layer owns its neurons and the connections that START in this layer.
struct Layer {
    Connection* Connections;
    Neuron* Neurons;
    int num_neurons;
    int num_connections;
};

struct Network {
    Layer* layers;
    int num_neurons;
    int layer_num;
    float fitness = BALANCE;
};

struct Population {
    Network* Networks;
    int generation_id;
};

// Allocates `count` elements of T from device code.
// A non-positive count yields a null pointer and cudaSuccess, so callers can
// treat "empty" and "allocated" uniformly. On failure *out is null.
template <typename T>
__device__ cudaError_t allocDeviceArray(T** out, int count) {
    *out = nullptr;
    if (count <= 0) return cudaSuccess;

    void* raw = nullptr;
    cudaError_t err = cudaMalloc(&raw, (size_t)count * sizeof(T));
    if (err == cudaSuccess && raw == nullptr) err = cudaErrorMemoryAllocation;
    if (err == cudaSuccess) *out = static_cast<T*>(raw);
    return err;
}

// Maps u in (0, 1] (the range of curand_uniform) to an index in [0, count).
// The u/10000 shrink keeps u == 1.0 from producing `count` itself.
__device__ int scaleToIndex(float u, int count) {
    return (int)((u - u / 10000.0f) * count);
}

// Uniform random weight in (-1, 1].
__device__ float randomWeight(curandState* rng) {
    return (2.0f * curand_uniform(rng)) - 1.0f;
}

// Uniform random bias in (-0.5, 0.5] used for newly inserted neurons.
__device__ float randomInsertedBias(curandState* rng) {
    return ((curand_uniform(rng) * 2.0f) - 1.0f) / 2.0f;
}

// Grows layer.Connections by one element holding `added`.
// `label` is only used in the failure message. Returns false (layer
// untouched) when the allocation fails.
__device__ bool appendConnection(Layer& layer, const Connection& added, const char* label) {
    const int old_count = layer.num_connections;

    Connection* grown = nullptr;
    cudaError_t err = allocDeviceArray(&grown, old_count + 1);
    if (err != cudaSuccess) {
        printf("[ERROR] cudaMalloc %s failed: %s\n", label, cudaGetErrorString(err));
        return false;
    }

    for (int c = 0; c < old_count; ++c) grown[c] = layer.Connections[c];
    grown[old_count] = added;

    cudaFree(layer.Connections);
    layer.Connections = grown;
    layer.num_connections = old_count + 1;
    return true;
}

// Grows layer.Neurons by one element holding `added`.
// Returns false (layer untouched) when the allocation fails.
__device__ bool appendNeuron(Layer& layer, const Neuron& added) {
    const int old_count = layer.num_neurons;

    Neuron* grown = nullptr;
    cudaError_t err = allocDeviceArray(&grown, old_count + 1);
    if (err != cudaSuccess) {
        printf("[ERROR] cudaMalloc expanded_neurons failed: %s\n", cudaGetErrorString(err));
        return false;
    }

    for (int n = 0; n < old_count; ++n) grown[n] = layer.Neurons[n];
    grown[old_count] = added;

    cudaFree(layer.Neurons);
    layer.Neurons = grown;
    layer.num_neurons = old_count + 1;
    return true;
}

// Inserts a brand-new single-neuron layer directly after `source_layer_index`
// and wires  source_neuron -> new neuron -> target_neuron.
//
// All allocations happen up front; nothing in `net` is modified unless every
// one of them succeeded, so a failure leaves the network intact and leaks
// nothing. Existing layers are moved (their descriptors are copied, buffers
// keep their owner) instead of being deep-copied.
//
// Returns false when an allocation failed.
__device__ bool insertLayerAfter(Network& net, int source_layer_index, int source_neuron_id,
                                 int target_neuron_id, curandState* rng) {
    Layer* old_layers = net.layers;
    const int old_layer_count = net.layer_num;
    const int new_layer_index = source_layer_index + 1;
    const int old_source_connections = old_layers[source_layer_index].num_connections;

    Layer* new_layers = nullptr;
    Neuron* mid_neurons = nullptr;
    Connection* mid_connections = nullptr;
    Connection* grown_source_connections = nullptr;

    const cudaError_t err_layers = allocDeviceArray(&new_layers, old_layer_count + 1);
    const cudaError_t err_neurons = allocDeviceArray(&mid_neurons, 1);
    const cudaError_t err_mid = allocDeviceArray(&mid_connections, 1);
    const cudaError_t err_source =
        allocDeviceArray(&grown_source_connections, old_source_connections + 1);

    const char* failed = nullptr;
    cudaError_t err = cudaSuccess;
    if (err_layers != cudaSuccess) {
        failed = "expanded_layer_array";
        err = err_layers;
    } else if (err_neurons != cudaSuccess) {
        failed = "intermediate_layer.Neurons";
        err = err_neurons;
    } else if (err_mid != cudaSuccess) {
        failed = "intermediate_layer.Connections";
        err = err_mid;
    } else if (err_source != cudaSuccess) {
        failed = "expanded_source_connections";
        err = err_source;
    }
    if (failed != nullptr) {
        printf("[ERROR] cudaMalloc %s failed: %s\n", failed, cudaGetErrorString(err));
        // Pointers of failed allocations are null; freeing null is a no-op.
        cudaFree(new_layers);
        cudaFree(mid_neurons);
        cudaFree(mid_connections);
        cudaFree(grown_source_connections);
        return false;
    }

    Neuron inserted;
    inserted.bias = randomInsertedBias(rng);
    inserted.id = net.num_neurons;  // ids are handed out sequentially
    mid_neurons[0] = inserted;

    Connection into_inserted;
    into_inserted.from = source_neuron_id;
    into_inserted.to = inserted.id;
    into_inserted.weight = randomWeight(rng);

    Connection out_of_inserted;
    out_of_inserted.from = inserted.id;
    out_of_inserted.to = target_neuron_id;
    out_of_inserted.weight = randomWeight(rng);
    mid_connections[0] = out_of_inserted;

    // Source layer's connections plus the new edge into the inserted neuron.
    for (int c = 0; c < old_source_connections; ++c) {
        grown_source_connections[c] = old_layers[source_layer_index].Connections[c];
    }
    grown_source_connections[old_source_connections] = into_inserted;

    // Layers up to and including the source keep their position.
    for (int l = 0; l <= source_layer_index; ++l) new_layers[l] = old_layers[l];
    cudaFree(new_layers[source_layer_index].Connections);
    new_layers[source_layer_index].Connections = grown_source_connections;
    new_layers[source_layer_index].num_connections = old_source_connections + 1;

    Layer inserted_layer;
    inserted_layer.Neurons = mid_neurons;
    inserted_layer.Connections = mid_connections;
    inserted_layer.num_neurons = 1;
    inserted_layer.num_connections = 1;
    new_layers[new_layer_index] = inserted_layer;

    // Everything after the source shifts down by one slot.
    for (int l = new_layer_index; l < old_layer_count; ++l) new_layers[l + 1] = old_layers[l];

    // Only the descriptor array is released; the buffers it pointed to are
    // now owned by new_layers.
    cudaFree(old_layers);
    net.layers = new_layers;
    net.layer_num = old_layer_count + 1;
    net.num_neurons++;
    return true;
}

// Adds a neuron to a randomly chosen layer strictly between the source and
// target layers and wires  source_neuron -> new neuron -> target_neuron.
// Requires target_layer_index - source_layer_index >= 2.
// Returns false when an allocation failed.
__device__ bool insertNeuronBetween(Network& net, int source_layer_index, int target_layer_index,
                                    int source_neuron_id, int target_neuron_id,
                                    curandState* rng) {
    const int host_layer_index =
        source_layer_index + 1 +
        scaleToIndex(curand_uniform(rng), target_layer_index - source_layer_index - 1);
    Layer& host_layer = net.layers[host_layer_index];

    Neuron inserted;
    inserted.bias = randomInsertedBias(rng);
    inserted.id = net.num_neurons;
    if (!appendNeuron(host_layer, inserted)) return false;
    net.num_neurons++;

    Connection forward;
    forward.from = inserted.id;
    forward.to = target_neuron_id;
    forward.weight = randomWeight(rng);
    if (!appendConnection(host_layer, forward, "expanded_target_connections")) return false;

    Connection backward;
    backward.from = source_neuron_id;
    backward.to = inserted.id;
    backward.weight = randomWeight(rng);
    return appendConnection(net.layers[source_layer_index], backward,
                            "expanded_source_connections");
}

// Builds the initial two-layer (input -> output), fully connected network for
// every population slot.
//
// Launch: 1D grid, one thread per network; surplus threads exit immediately.
//
// All per-network buffers are allocated here, from device code, because
// Mutate later releases and regrows them with device-side cudaFree/cudaMalloc
// and device and host allocations must not be mixed. Any pointer previously
// stored in d_networks[idx].layers is overwritten, not freed. If an
// allocation fails the network is left empty (layers == nullptr,
// layer_num == 0) and an error is printed.
__global__ void CreateBasePopulation(Network* d_networks, int pop_num, int input_num,
                                     int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &d_networks[idx];
    net->layers = nullptr;
    net->num_neurons = 0;
    net->layer_num = 0;
    net->fitness = BALANCE;

    Layer* layers = nullptr;
    Neuron* input_neurons = nullptr;
    Neuron* output_neurons = nullptr;
    Connection* connections = nullptr;

    const cudaError_t err_layers = allocDeviceArray(&layers, 2);
    const cudaError_t err_in = allocDeviceArray(&input_neurons, input_num);
    const cudaError_t err_out = allocDeviceArray(&output_neurons, output_num);
    const cudaError_t err_conn = allocDeviceArray(&connections, input_num * output_num);
    if (err_layers != cudaSuccess || err_in != cudaSuccess || err_out != cudaSuccess ||
        err_conn != cudaSuccess) {
        printf("[ERROR] device allocation failed for network %d\n", idx);
        cudaFree(layers);
        cudaFree(input_neurons);
        cudaFree(output_neurons);
        cudaFree(connections);
        return;
    }

    layers[0].Neurons = input_neurons;
    layers[0].Connections = connections;
    layers[0].num_neurons = input_num;
    layers[0].num_connections = input_num * output_num;

    layers[1].Neurons = output_neurons;
    layers[1].Connections = nullptr;  // the output layer has no outgoing edges
    layers[1].num_neurons = output_num;
    layers[1].num_connections = 0;

    curandState state;
    curand_init(clock64(), idx, 0, &state);

    // Input neurons get ids [0, input_num) and zero bias; each one is
    // connected to every output neuron with a weight in (-1, 1].
    for (int i = 0; i < input_num; ++i) {
        input_neurons[i].id = i;
        input_neurons[i].bias = 0.0f;

        for (int j = 0; j < output_num; ++j) {
            Connection& edge = connections[j + i * output_num];
            edge.from = i;
            edge.to = input_num + j;
            edge.weight = randomWeight(&state);
        }
    }

    // Output neurons get ids [input_num, input_num + output_num) and a bias
    // drawn from (-sqrt(input_num), sqrt(input_num)] scaled by 1/output_num.
    const float bias_span = sqrtf((float)input_num);
    for (int i = 0; i < output_num; ++i) {
        output_neurons[i].id = input_num + i;
        output_neurons[i].bias =
            ((2.0f * bias_span * curand_uniform(&state)) - bias_span) / output_num;
    }

    net->layers = layers;
    net->layer_num = 2;
    net->num_neurons = input_num + output_num;
}

// Applies one round of random mutation to every network except the fittest.
//
// Launch: 1D grid, one thread per network. Uses two words of static shared
// memory per block.
//
// Per network, in order:
//   1. with probability neuron_addition_probability, insert one neuron
//      between a random source neuron and a random neuron in a later layer
//      (creating a new layer when the two layers are adjacent);
//   2. with probability connection_addition_probability -- and only if the
//      network had already grown beyond initial_neuron_count when the kernel
//      started -- add one not-yet-existing forward connection;
//   3. perturb each bias / weight with probability neuron_mutation_probability
//      / connection_mutation_probability by a factor in
//      (1 - rate, 1 + rate].
//
// Expects every network's buffers to come from device-side allocation (see
// CreateBasePopulation). Networks with no layers are skipped. If an
// allocation fails the thread stops mutating its network; the network stays
// structurally valid.
__global__ void Mutate(Network* device_networks, int initial_neuron_count, int population_size,
                       float neuron_addition_probability, float neuron_mutation_probability,
                       float connection_addition_probability,
                       float connection_mutation_probability, float weight_mutation_rate,
                       float bias_mutation_rate) {
    int network_index = threadIdx.x + blockIdx.x * blockDim.x;

    // Thread 0 of each block finds the elite (ties go to the highest index).
    __shared__ float highest_fitness;
    __shared__ int elite_network_index;
    if (threadIdx.x == 0) {
        highest_fitness = -1.0f;
        elite_network_index = 0;
        for (int i = 0; i < population_size; ++i) {
            if (device_networks[i].fitness >= highest_fitness) {
                highest_fitness = device_networks[i].fitness;
                elite_network_index = i;
            }
        }
    }
    // Every thread of the block must reach the barrier, so the range guard
    // comes after it rather than before.
    __syncthreads();

    if (network_index >= population_size) return;
    if (network_index == elite_network_index) return;  // the elite is kept unchanged

    Network& current_network = device_networks[network_index];
    if (current_network.layers == nullptr || current_network.layer_num < 2) return;

    const bool is_initial_network = (current_network.num_neurons == initial_neuron_count);

    // The global index is used as the subsequence so threads in different
    // blocks do not share a random stream.
    curandState random_state;
    curand_init(clock64(), network_index, 0, &random_state);

    // ---- 1. neuron insertion -------------------------------------------
    if (curand_uniform(&random_state) < neuron_addition_probability) {
        for (int attempt = 0; attempt < 100; ++attempt) {
            const float upper_layer_random = curand_uniform(&random_state);
            const float lower_layer_random = curand_uniform(&random_state);

            // Source is any layer but the last; target is any later layer.
            const int source_layer_index =
                scaleToIndex(lower_layer_random, current_network.layer_num - 1);
            const int target_layer_index =
                source_layer_index + 1 +
                scaleToIndex(upper_layer_random,
                             current_network.layer_num - source_layer_index - 1);

            const Layer& source_layer = current_network.layers[source_layer_index];
            const Layer& target_layer = current_network.layers[target_layer_index];
            if (source_layer.num_neurons == 0 || target_layer.num_neurons == 0) continue;

            // Ids are copied out now: the helpers below may free or replace
            // the arrays these neurons live in.
            const int source_neuron_id =
                source_layer
                    .Neurons[scaleToIndex(curand_uniform(&random_state), source_layer.num_neurons)]
                    .id;
            const int target_neuron_id =
                target_layer
                    .Neurons[scaleToIndex(curand_uniform(&random_state), target_layer.num_neurons)]
                    .id;

            const bool inserted =
                (target_layer_index - source_layer_index == 1)
                    ? insertLayerAfter(current_network, source_layer_index, source_neuron_id,
                                       target_neuron_id, &random_state)
                    : insertNeuronBetween(current_network, source_layer_index,
                                          target_layer_index, source_neuron_id,
                                          target_neuron_id, &random_state);
            if (!inserted) return;
            break;
        }
    }

    // ---- 2. connection insertion ---------------------------------------
    const float connection_mutation_selector = curand_uniform(&random_state);
    if (!is_initial_network && connection_mutation_selector < connection_addition_probability) {
        for (int attempt = 0; attempt < 65; ++attempt) {
            const float upper_layer_random = curand_uniform(&random_state);
            const float lower_layer_random = curand_uniform(&random_state);

            const int source_layer_index =
                scaleToIndex(lower_layer_random, current_network.layer_num - 1);
            const int target_layer_index =
                source_layer_index + 1 +
                scaleToIndex(upper_layer_random,
                             current_network.layer_num - source_layer_index - 1);

            Layer& source_layer = current_network.layers[source_layer_index];
            const Layer& target_layer = current_network.layers[target_layer_index];
            if (source_layer.num_neurons == 0 || target_layer.num_neurons == 0) continue;

            const int source_neuron_id =
                source_layer
                    .Neurons[scaleToIndex(curand_uniform(&random_state), source_layer.num_neurons)]
                    .id;
            const int target_neuron_id =
                target_layer
                    .Neurons[scaleToIndex(curand_uniform(&random_state), target_layer.num_neurons)]
                    .id;

            // Retry with a new pair if this edge is already present.
            bool connection_exists = false;
            for (int c = 0; c < source_layer.num_connections; ++c) {
                if (source_layer.Connections[c].from == source_neuron_id &&
                    source_layer.Connections[c].to == target_neuron_id) {
                    connection_exists = true;
                    break;
                }
            }
            if (connection_exists) continue;

            Connection added;
            added.from = source_neuron_id;
            added.to = target_neuron_id;
            added.weight = randomWeight(&random_state);
            if (!appendConnection(source_layer, added, "expanded_connections")) return;
            break;
        }
    }

    // ---- 3. bias / weight perturbation ---------------------------------
    for (int layer_index = 0; layer_index < current_network.layer_num; ++layer_index) {
        Layer& layer = current_network.layers[layer_index];

        for (int n = 0; n < layer.num_neurons; ++n) {
            if (curand_uniform(&random_state) < neuron_mutation_probability) {
                float& bias = layer.Neurons[n].bias;
                const float bias_delta = curand_uniform(&random_state) * bias_mutation_rate * bias;
                // Net effect: bias *= 1 + rate * (2u - 1).
                bias += (2.0f * bias_delta) - bias_mutation_rate * bias;
            }
        }

        for (int c = 0; c < layer.num_connections; ++c) {
            if (curand_uniform(&random_state) < connection_mutation_probability) {
                float& weight = layer.Connections[c].weight;
                const float weight_delta =
                    curand_uniform(&random_state) * weight_mutation_rate * weight;
                weight += (2.0f * weight_delta) - weight_mutation_rate * weight;
            }
        }
    }
}

// Releases every device-heap buffer owned by the population's networks.
// Launch: 1D grid, one thread per network. Must run on the device because the
// buffers were allocated by device-side cudaMalloc.
__global__ void DestroyPopulation(Network* d_networks, int pop_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network& net = d_networks[idx];
    if (net.layers != nullptr) {
        for (int l = 0; l < net.layer_num; ++l) {
            cudaFree(net.layers[l].Neurons);
            cudaFree(net.layers[l].Connections);
        }
        cudaFree(net.layers);
    }
    net.layers = nullptr;
    net.layer_num = 0;
    net.num_neurons = 0;
}

int main() {
    const int THREADS_PER_BLOCK = 128;
    const float NEURON_ADD_PROB = 0.45f;
    const float CONNECTION_ADD_PROB = 0.45f;
    const float CONNECTION_MUTATE_PROB = 0.425f;
    const float NEURON_MUTATE_PROB = 0.45f;
    const float WEIGHT_MUTATE_RATE = 0.8f;
    const float BIAS_MUTATE_RATE = 0.8f;
    const int POPULATION_SIZE = 32;
    const int HEAP_SIZE = 2;  // device malloc heap, in GB

    const int population_size = POPULATION_SIZE;
    const int output_num = 3;
    const int input_num = 390;

    // Limits must be set before the first kernel that allocates is launched.
    gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize,
                                 (size_t)(HEAP_SIZE) * 1024 * 1024 * 1024));
    gpuErrchk(cudaDeviceSetLimit(cudaLimitStackSize, 32768));
    std::cout << "[+] Heap size successfully set to " << HEAP_SIZE << " GB\n";

    Population h_population;
    h_population.generation_id = 1;
    gpuErrchk(cudaMalloc(&h_population.Networks, population_size * sizeof(Network)));

    Network* d_networks;
    gpuErrchk(cudaMalloc(&d_networks, population_size * sizeof(Network)));
    gpuErrchk(cudaMemset(d_networks, 0, population_size * sizeof(Network)));

    const int threadsPerBlock = THREADS_PER_BLOCK;
    const int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;

    CreateBasePopulation<<<blocks, threadsPerBlock>>>(d_networks, population_size, input_num,
                                                      output_num);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // The kernel reports allocation failure by leaving layers == nullptr.
    std::vector<Network> created(population_size);
    gpuErrchk(cudaMemcpy(created.data(), d_networks, population_size * sizeof(Network),
                         cudaMemcpyDeviceToHost));
    for (int i = 0; i < population_size; ++i) {
        if (created[i].layers == nullptr) {
            std::cerr << "[-] Network " << i << " could not be allocated on the device heap\n";
            return EXIT_FAILURE;
        }
    }

    // Shallow snapshot of the generation-1 descriptors. The layer pointers it
    // holds go stale as soon as Mutate regrows a network.
    gpuErrchk(cudaMemcpy(h_population.Networks, d_networks, population_size * sizeof(Network),
                         cudaMemcpyDeviceToDevice));
    std::cout << "[+] Population created successfully!\n";

    for (int generation = 1; generation <= 100; ++generation) {
        Mutate<<<blocks, threadsPerBlock>>>(d_networks, (input_num + output_num), population_size,
                                            NEURON_ADD_PROB, NEURON_MUTATE_PROB,
                                            CONNECTION_ADD_PROB, CONNECTION_MUTATE_PROB,
                                            WEIGHT_MUTATE_RATE, BIAS_MUTATE_RATE);
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }

    DestroyPopulation<<<blocks, threadsPerBlock>>>(d_networks, population_size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaFree(h_population.Networks));
    gpuErrchk(cudaFree(d_networks));
    return 0;
}
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree