CUDA Guide¶
Applies to: CUDA 11+, GPU Computing, Deep Learning, Scientific Computing
Core Principles¶
- Parallelism First: Design for thousands of concurrent threads
- Memory Hierarchy: Optimize for global, shared, and register memory
- Coalesced Access: Align memory access patterns for bandwidth
- Occupancy: Maximize GPU utilization
- Host-Device Coordination: Minimize data transfers
Language-Specific Guardrails¶
CUDA Version & Setup¶
- ✓ Use CUDA 11.0+ (12.x recommended for new projects)
- ✓ Use CMake with CUDA language support
- ✓ Target appropriate compute capability for hardware
- ✓ Use CUDA Toolkit matching driver version
- ✓ Enable
-arch=nativeor specific-arch=sm_XXfor optimization
Code Style¶
- ✓ Use
__global__for kernel functions - ✓ Use
__device__for device-only functions - ✓ Use
__host__ __device__for functions callable from both - ✓ Prefix kernel names descriptively:
kernel_,k_ - ✓ Use
snake_casefor variables and functions - ✓ Use
PascalCasefor classes and structs - ✓ Document thread/block dimensions in kernel comments
Memory Management¶
- ✓ Use CUDA Unified Memory (
cudaMallocManaged) for simplicity - ✓ Use explicit
cudaMalloc/cudaMemcpyfor performance-critical code - ✓ Always check CUDA API return values
- ✓ Use
cudaFreefor everycudaMalloc - ✓ Prefer
cudaMemcpyAsyncwith streams for overlap - ✓ Use pinned memory (
cudaMallocHost) for faster transfers
Error Handling¶
- ✓ Check every CUDA API call with
cudaGetLastError() - ✓ Use error checking macro for all CUDA calls
- ✓ Handle out-of-memory errors gracefully
- ✓ Synchronize before checking kernel errors
- ✓ Use
cuda-memcheckduring development
Thread Organization¶
- ✓ Choose block size as multiple of warp size (32)
- ✓ Common block sizes: 128, 256, 512 threads
- ✓ Ensure thread count covers all data elements
- ✓ Avoid divergent branches within warps
- ✓ Use grid-stride loops for flexible sizing
Project Structure¶
CMake CUDA Project¶
myproject/
├── CMakeLists.txt
├── include/
│ └── myproject/
│ ├── kernels.cuh # Kernel declarations
│ └── cuda_utils.cuh # Utilities
├── src/
│ ├── main.cpp # Host code
│ ├── kernels.cu # Kernel implementations
│ └── cuda_utils.cu
├── tests/
│ └── test_kernels.cu
└── README.md
CMakeLists.txt¶
cmake_minimum_required(VERSION 3.18)
project(myproject LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
# Find CUDA
find_package(CUDAToolkit REQUIRED)
# Set architecture (adjust for your GPU)
set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)
# Enable separable compilation for device linking
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
# Library
add_library(myproject
src/kernels.cu
src/cuda_utils.cu
)
target_include_directories(myproject PUBLIC include)
# Executable
add_executable(main src/main.cpp)
target_link_libraries(main myproject CUDA::cudart)
# Tests
enable_testing()
add_executable(tests tests/test_kernels.cu)
target_link_libraries(tests myproject CUDA::cudart)
Basic CUDA Patterns¶
Error Checking Macro¶
#include <cuda_runtime.h>
#include <cstdio>
#define CUDA_CHECK(call) \
do { \
cudaError_t error = call; \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CUDA_CHECK_LAST() \
do { \
cudaError_t error = cudaGetLastError(); \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA kernel error at %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
} while (0)
Simple Kernel¶
// Vector addition kernel
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
// Launch kernel
void launch_vector_add(const float* a, const float* b, float* c, int n) {
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
vector_add<<<grid_size, block_size>>>(a, b, c, n);
CUDA_CHECK_LAST();
}
Grid-Stride Loop Pattern¶
// Handles any size array efficiently
__global__ void vector_add_stride(const float* a, const float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
c[i] = a[i] + b[i];
}
}
// Can use fixed grid size
void launch_vector_add_stride(const float* a, const float* b, float* c, int n) {
int block_size = 256;
int grid_size = 256; // Fixed, handles any n
vector_add_stride<<<grid_size, block_size>>>(a, b, c, n);
CUDA_CHECK_LAST();
}
Memory Management¶
Explicit Memory Management¶
#include <cuda_runtime.h>
void example_explicit_memory() {
const int n = 1000000;
const size_t size = n * sizeof(float);
// Host memory
float* h_a = new float[n];
float* h_b = new float[n];
float* h_c = new float[n];
// Initialize host data
for (int i = 0; i < n; i++) {
h_a[i] = 1.0f;
h_b[i] = 2.0f;
}
// Device memory
float *d_a, *d_b, *d_c;
CUDA_CHECK(cudaMalloc(&d_a, size));
CUDA_CHECK(cudaMalloc(&d_b, size));
CUDA_CHECK(cudaMalloc(&d_c, size));
// Copy to device
CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));
// Launch kernel
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
vector_add<<<grid_size, block_size>>>(d_a, d_b, d_c, n);
CUDA_CHECK_LAST();
// Copy back
CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));
// Cleanup
CUDA_CHECK(cudaFree(d_a));
CUDA_CHECK(cudaFree(d_b));
CUDA_CHECK(cudaFree(d_c));
delete[] h_a;
delete[] h_b;
delete[] h_c;
}
Unified Memory¶
void example_unified_memory() {
const int n = 1000000;
const size_t size = n * sizeof(float);
// Unified memory (accessible from host and device)
float *a, *b, *c;
CUDA_CHECK(cudaMallocManaged(&a, size));
CUDA_CHECK(cudaMallocManaged(&b, size));
CUDA_CHECK(cudaMallocManaged(&c, size));
// Initialize on host (no explicit copy needed)
for (int i = 0; i < n; i++) {
a[i] = 1.0f;
b[i] = 2.0f;
}
// Launch kernel
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
vector_add<<<grid_size, block_size>>>(a, b, c, n);
// Wait for GPU to finish
CUDA_CHECK(cudaDeviceSynchronize());
// Access result on host (no explicit copy)
printf("Result: %f\n", c[0]);
// Cleanup
CUDA_CHECK(cudaFree(a));
CUDA_CHECK(cudaFree(b));
CUDA_CHECK(cudaFree(c));
}
Pinned Memory for Fast Transfers¶
void example_pinned_memory() {
const int n = 1000000;
const size_t size = n * sizeof(float);
// Pinned (page-locked) host memory
float* h_a;
CUDA_CHECK(cudaMallocHost(&h_a, size));
float* d_a;
CUDA_CHECK(cudaMalloc(&d_a, size));
// Faster transfer with pinned memory
CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
// Cleanup
CUDA_CHECK(cudaFree(d_a));
CUDA_CHECK(cudaFreeHost(h_a));
}
Shared Memory¶
Basic Shared Memory Usage¶
__global__ void reduce_sum(const float* input, float* output, int n) {
extern __shared__ float sdata[];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Load to shared memory
sdata[tid] = (idx < n) ? input[idx] : 0.0f;
__syncthreads();
// Reduction in shared memory
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// Write result
if (tid == 0) {
output[blockIdx.x] = sdata[0];
}
}
// Launch with dynamic shared memory
void launch_reduce(const float* input, float* output, int n) {
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
size_t shared_size = block_size * sizeof(float);
reduce_sum<<<grid_size, block_size, shared_size>>>(input, output, n);
CUDA_CHECK_LAST();
}
Tiled Matrix Multiplication¶
#define TILE_SIZE 16
__global__ void matrix_multiply_tiled(
const float* A, const float* B, float* C,
int M, int N, int K
) {
__shared__ float As[TILE_SIZE][TILE_SIZE];
__shared__ float Bs[TILE_SIZE][TILE_SIZE];
int row = blockIdx.y * TILE_SIZE + threadIdx.y;
int col = blockIdx.x * TILE_SIZE + threadIdx.x;
float sum = 0.0f;
for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
// Load tiles into shared memory
int a_col = t * TILE_SIZE + threadIdx.x;
int b_row = t * TILE_SIZE + threadIdx.y;
As[threadIdx.y][threadIdx.x] =
(row < M && a_col < K) ? A[row * K + a_col] : 0.0f;
Bs[threadIdx.y][threadIdx.x] =
(b_row < K && col < N) ? B[b_row * N + col] : 0.0f;
__syncthreads();
// Compute partial dot product
for (int k = 0; k < TILE_SIZE; k++) {
sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
}
__syncthreads();
}
if (row < M && col < N) {
C[row * N + col] = sum;
}
}
Streams and Async Operations¶
Concurrent Kernels with Streams¶
void example_streams() {
const int n = 1000000;
const size_t size = n * sizeof(float);
const int num_streams = 4;
// Create streams
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
// Allocate memory
float *h_a, *d_a;
CUDA_CHECK(cudaMallocHost(&h_a, size * num_streams));
CUDA_CHECK(cudaMalloc(&d_a, size * num_streams));
// Launch operations in parallel streams
for (int i = 0; i < num_streams; i++) {
size_t offset = i * n;
// Async copy
CUDA_CHECK(cudaMemcpyAsync(
d_a + offset, h_a + offset, size,
cudaMemcpyHostToDevice, streams[i]
));
// Kernel in stream
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
some_kernel<<<grid_size, block_size, 0, streams[i]>>>(
d_a + offset, n
);
// Async copy back
CUDA_CHECK(cudaMemcpyAsync(
h_a + offset, d_a + offset, size,
cudaMemcpyDeviceToHost, streams[i]
));
}
// Wait for all streams
CUDA_CHECK(cudaDeviceSynchronize());
// Cleanup
for (int i = 0; i < num_streams; i++) {
CUDA_CHECK(cudaStreamDestroy(streams[i]));
}
CUDA_CHECK(cudaFreeHost(h_a));
CUDA_CHECK(cudaFree(d_a));
}
Events for Timing¶
void benchmark_kernel() {
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
// Record start
CUDA_CHECK(cudaEventRecord(start));
// Launch kernel
some_kernel<<<grid_size, block_size>>>(args);
// Record stop
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
// Calculate elapsed time
float milliseconds = 0;
CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
printf("Kernel time: %.3f ms\n", milliseconds);
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}
Atomic Operations¶
Common Atomics¶
__global__ void histogram(const int* data, int* hist, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
int bin = data[idx];
atomicAdd(&hist[bin], 1);
}
}
__global__ void find_max(const float* data, float* result, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
// atomicMax for floats (CUDA 11+)
atomicMax((int*)result, __float_as_int(data[idx]));
}
}
__global__ void cas_example(int* data) {
int idx = threadIdx.x;
// Compare-and-swap
int old = atomicCAS(&data[0], 0, idx);
// Only one thread succeeds in setting value
}
Shared Memory Atomics (Faster)¶
__global__ void histogram_shared(const int* data, int* hist, int n, int num_bins) {
extern __shared__ int s_hist[];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize shared histogram
for (int i = tid; i < num_bins; i += blockDim.x) {
s_hist[i] = 0;
}
__syncthreads();
// Accumulate in shared memory
if (idx < n) {
atomicAdd(&s_hist[data[idx]], 1);
}
__syncthreads();
// Write to global memory
for (int i = tid; i < num_bins; i += blockDim.x) {
atomicAdd(&hist[i], s_hist[i]);
}
}
Thrust Library¶
Using Thrust for High-Level Operations¶
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
void example_thrust() {
// Host vector
thrust::host_vector<float> h_vec(1000000);
for (int i = 0; i < h_vec.size(); i++) {
h_vec[i] = static_cast<float>(rand()) / RAND_MAX;
}
// Copy to device
thrust::device_vector<float> d_vec = h_vec;
// Sort
thrust::sort(d_vec.begin(), d_vec.end());
// Reduce (sum)
float sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::plus<float>());
printf("Sum: %f\n", sum);
// Transform
thrust::transform(
d_vec.begin(), d_vec.end(),
d_vec.begin(),
thrust::negate<float>()
);
// Custom functor
struct square_functor {
__host__ __device__
float operator()(float x) const {
return x * x;
}
};
thrust::transform(
d_vec.begin(), d_vec.end(),
d_vec.begin(),
square_functor()
);
// Copy back
h_vec = d_vec;
}
Performance Optimization¶
Optimization Guardrails¶
- ✓ Maximize occupancy (threads per SM)
- ✓ Coalesce global memory accesses
- ✓ Use shared memory for reused data
- ✓ Avoid warp divergence
- ✓ Minimize host-device transfers
- ✓ Use streams for overlap
- ✓ Profile with Nsight Compute
Memory Coalescing¶
// BAD: Strided access (not coalesced)
__global__ void bad_access(float* data, int stride, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx * stride] = 1.0f; // Non-coalesced!
}
}
// GOOD: Contiguous access (coalesced)
__global__ void good_access(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx] = 1.0f; // Coalesced
}
}
// For 2D arrays: access by row (contiguous)
__global__ void row_access(float* matrix, int rows, int cols) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < rows && col < cols) {
// Row-major: contiguous within warp if threads vary in col
matrix[row * cols + col] = 1.0f;
}
}
Avoiding Warp Divergence¶
// BAD: Divergent branch
__global__ void divergent_kernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
if (idx % 2 == 0) {
data[idx] = expensive_operation_a(idx);
} else {
data[idx] = expensive_operation_b(idx);
}
}
}
// BETTER: Process separately or use predication
__global__ void less_divergent_kernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
// Both computed, compiler may optimize
float a = expensive_operation_a(idx);
float b = expensive_operation_b(idx);
data[idx] = (idx % 2 == 0) ? a : b;
}
}
Testing¶
Simple Test Framework¶
#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>
#define ASSERT_NEAR(expected, actual, epsilon) \
do { \
if (fabs((expected) - (actual)) > (epsilon)) { \
fprintf(stderr, "FAILED: %s:%d: expected %f, got %f\n", \
__FILE__, __LINE__, (expected), (actual)); \
return false; \
} \
} while (0)
bool test_vector_add() {
const int n = 1000;
float *a, *b, *c;
CUDA_CHECK(cudaMallocManaged(&a, n * sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&b, n * sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&c, n * sizeof(float)));
// Initialize
for (int i = 0; i < n; i++) {
a[i] = 1.0f;
b[i] = 2.0f;
}
// Run kernel
launch_vector_add(a, b, c, n);
CUDA_CHECK(cudaDeviceSynchronize());
// Verify
for (int i = 0; i < n; i++) {
ASSERT_NEAR(3.0f, c[i], 1e-5f);
}
CUDA_CHECK(cudaFree(a));
CUDA_CHECK(cudaFree(b));
CUDA_CHECK(cudaFree(c));
printf("test_vector_add PASSED\n");
return true;
}
int main() {
bool all_passed = true;
all_passed &= test_vector_add();
// Add more tests...
return all_passed ? 0 : 1;
}
Common Pitfalls¶
Don't Do This¶
// Forgetting bounds check
__global__ void bad_bounds(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
data[idx] = 1.0f; // Out of bounds if idx >= n!
}
// Not synchronizing before checking errors
kernel<<<grid, block>>>(args);
// cudaGetLastError() here won't catch async kernel errors
// Memory leak
float* d_ptr;
cudaMalloc(&d_ptr, size);
// Forgot cudaFree(d_ptr)!
// Race condition in shared memory
__global__ void race_condition() {
__shared__ float shared_data[256];
shared_data[threadIdx.x] = threadIdx.x;
// Missing __syncthreads()!
float val = shared_data[threadIdx.x + 1]; // Race!
}
Do This Instead¶
// Always check bounds
__global__ void good_bounds(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx] = 1.0f;
}
}
// Synchronize before checking errors
kernel<<<grid, block>>>(args);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK_LAST();
// RAII wrapper for CUDA memory
template<typename T>
class DeviceBuffer {
public:
explicit DeviceBuffer(size_t count) : size_(count * sizeof(T)) {
CUDA_CHECK(cudaMalloc(&ptr_, size_));
}
~DeviceBuffer() {
cudaFree(ptr_);
}
T* get() { return ptr_; }
// Non-copyable
DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete;
private:
T* ptr_;
size_t size_;
};
// Proper synchronization
__global__ void no_race() {
__shared__ float shared_data[256];
shared_data[threadIdx.x] = threadIdx.x;
__syncthreads(); // Wait for all threads
float val = shared_data[(threadIdx.x + 1) % blockDim.x];
}
Debugging Tools¶
cuda-memcheck¶
# Check for memory errors
cuda-memcheck ./program
# Check for race conditions
cuda-memcheck --tool racecheck ./program
# Check for initialization errors
cuda-memcheck --tool initcheck ./program
# Memory leak detection
cuda-memcheck --leak-check full ./program
Nsight Compute (Profiling)¶
# Profile kernel
ncu ./program
# Profile specific kernel
ncu --kernel-name "my_kernel" ./program
# Generate report
ncu -o report ./program
ncu -i report.ncu-rep # View report
Device Query¶
void print_device_info() {
int device;
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
printf("Device: %s\n", props.name);
printf("Compute capability: %d.%d\n", props.major, props.minor);
printf("Total global memory: %.2f GB\n",
props.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
printf("Shared memory per block: %zu KB\n",
props.sharedMemPerBlock / 1024);
printf("Max threads per block: %d\n", props.maxThreadsPerBlock);
printf("Warp size: %d\n", props.warpSize);
printf("Max grid size: (%d, %d, %d)\n",
props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
}