CUDA Guide¶

Applies to: CUDA 11+, GPU Computing, Deep Learning, Scientific Computing

Core Principles¶

Parallelism First: Design for thousands of concurrent threads
Memory Hierarchy: Optimize for global, shared, and register memory
Coalesced Access: Align memory access patterns for bandwidth
Occupancy: Maximize GPU utilization
Host-Device Coordination: Minimize data transfers

Language-Specific Guardrails¶

CUDA Version & Setup¶

✓ Use CUDA 11.0+ (12.x recommended for new projects)
✓ Use CMake with CUDA language support
✓ Target appropriate compute capability for hardware
✓ Use CUDA Toolkit matching driver version
✓ Enable -arch=native or specific -arch=sm_XX for optimization

Code Style¶

✓ Use __global__ for kernel functions
✓ Use __device__ for device-only functions
✓ Use __host__ __device__ for functions callable from both
✓ Prefix kernel names descriptively: kernel_, k_
✓ Use snake_case for variables and functions
✓ Use PascalCase for classes and structs
✓ Document thread/block dimensions in kernel comments

Memory Management¶

✓ Use CUDA Unified Memory (cudaMallocManaged) for simplicity
✓ Use explicit cudaMalloc/cudaMemcpy for performance-critical code
✓ Always check CUDA API return values
✓ Use cudaFree for every cudaMalloc
✓ Prefer cudaMemcpyAsync with streams for overlap
✓ Use pinned memory (cudaMallocHost) for faster transfers

Error Handling¶

✓ Check every CUDA API call with cudaGetLastError()
✓ Use error checking macro for all CUDA calls
✓ Handle out-of-memory errors gracefully
✓ Synchronize before checking kernel errors
✓ Use cuda-memcheck during development

Thread Organization¶

✓ Choose block size as multiple of warp size (32)
✓ Common block sizes: 128, 256, 512 threads
✓ Ensure thread count covers all data elements
✓ Avoid divergent branches within warps
✓ Use grid-stride loops for flexible sizing

Project Structure¶

CMake CUDA Project¶

myproject/
├── CMakeLists.txt
├── include/
│   └── myproject/
│       ├── kernels.cuh           # Kernel declarations
│       └── cuda_utils.cuh        # Utilities
├── src/
│   ├── main.cpp                  # Host code
│   ├── kernels.cu                # Kernel implementations
│   └── cuda_utils.cu
├── tests/
│   └── test_kernels.cu
└── README.md

CMakeLists.txt¶

cmake_minimum_required(VERSION 3.18)
project(myproject LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)

# Find CUDA
find_package(CUDAToolkit REQUIRED)

# Set architecture (adjust for your GPU)
set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)

# Enable separable compilation for device linking
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# Library
add_library(myproject
    src/kernels.cu
    src/cuda_utils.cu
)

target_include_directories(myproject PUBLIC include)

# Executable
add_executable(main src/main.cpp)
target_link_libraries(main myproject CUDA::cudart)

# Tests
enable_testing()
add_executable(tests tests/test_kernels.cu)
target_link_libraries(tests myproject CUDA::cudart)

Basic CUDA Patterns¶

Error Checking Macro¶

#include <cuda_runtime.h>
#include <cstdio>

#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t error = call;                                          \
        if (error != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                   \
                    __FILE__, __LINE__, cudaGetErrorString(error));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                   \
    } while (0)

#define CUDA_CHECK_LAST()                                                   \
    do {                                                                    \
        cudaError_t error = cudaGetLastError();                            \
        if (error != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA kernel error at %s:%d: %s\n",            \
                    __FILE__, __LINE__, cudaGetErrorString(error));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                   \
    } while (0)

Simple Kernel¶

// Vector addition kernel
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Launch kernel
void launch_vector_add(const float* a, const float* b, float* c, int n) {
    int block_size = 256;
    int grid_size = (n + block_size - 1) / block_size;

    vector_add<<<grid_size, block_size>>>(a, b, c, n);
    CUDA_CHECK_LAST();
}

Grid-Stride Loop Pattern¶

// Handles any size array efficiently
__global__ void vector_add_stride(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = idx; i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}

// Can use fixed grid size
void launch_vector_add_stride(const float* a, const float* b, float* c, int n) {
    int block_size = 256;
    int grid_size = 256;  // Fixed, handles any n

    vector_add_stride<<<grid_size, block_size>>>(a, b, c, n);
    CUDA_CHECK_LAST();
}

Memory Management¶

Explicit Memory Management¶

#include <cuda_runtime.h>

void example_explicit_memory() {
    const int n = 1000000;
    const size_t size = n * sizeof(float);

    // Host memory
    float* h_a = new float[n];
    float* h_b = new float[n];
    float* h_c = new float[n];

    // Initialize host data
    for (int i = 0; i < n; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    // Device memory
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // Copy to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // Launch kernel
    int block_size = 256;
    int grid_size = (n + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(d_a, d_b, d_c, n);
    CUDA_CHECK_LAST();

    // Copy back
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Cleanup
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
}

Unified Memory¶

void example_unified_memory() {
    const int n = 1000000;
    const size_t size = n * sizeof(float);

    // Unified memory (accessible from host and device)
    float *a, *b, *c;
    CUDA_CHECK(cudaMallocManaged(&a, size));
    CUDA_CHECK(cudaMallocManaged(&b, size));
    CUDA_CHECK(cudaMallocManaged(&c, size));

    // Initialize on host (no explicit copy needed)
    for (int i = 0; i < n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Launch kernel
    int block_size = 256;
    int grid_size = (n + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(a, b, c, n);

    // Wait for GPU to finish
    CUDA_CHECK(cudaDeviceSynchronize());

    // Access result on host (no explicit copy)
    printf("Result: %f\n", c[0]);

    // Cleanup
    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
}

Pinned Memory for Fast Transfers¶

void example_pinned_memory() {
    const int n = 1000000;
    const size_t size = n * sizeof(float);

    // Pinned (page-locked) host memory
    float* h_a;
    CUDA_CHECK(cudaMallocHost(&h_a, size));

    float* d_a;
    CUDA_CHECK(cudaMalloc(&d_a, size));

    // Faster transfer with pinned memory
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));

    // Cleanup
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFreeHost(h_a));
}

Shared Memory¶

Basic Shared Memory Usage¶

__global__ void reduce_sum(const float* input, float* output, int n) {
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load to shared memory
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // Reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write result
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

// Launch with dynamic shared memory
void launch_reduce(const float* input, float* output, int n) {
    int block_size = 256;
    int grid_size = (n + block_size - 1) / block_size;
    size_t shared_size = block_size * sizeof(float);

    reduce_sum<<<grid_size, block_size, shared_size>>>(input, output, n);
    CUDA_CHECK_LAST();
}

Tiled Matrix Multiplication¶

#define TILE_SIZE 16

__global__ void matrix_multiply_tiled(
    const float* A, const float* B, float* C,
    int M, int N, int K
) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Load tiles into shared memory
        int a_col = t * TILE_SIZE + threadIdx.x;
        int b_row = t * TILE_SIZE + threadIdx.y;

        As[threadIdx.y][threadIdx.x] =
            (row < M && a_col < K) ? A[row * K + a_col] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] =
            (b_row < K && col < N) ? B[b_row * N + col] : 0.0f;

        __syncthreads();

        // Compute partial dot product
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }

        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

Streams and Async Operations¶

Concurrent Kernels with Streams¶

void example_streams() {
    const int n = 1000000;
    const size_t size = n * sizeof(float);
    const int num_streams = 4;

    // Create streams
    cudaStream_t streams[num_streams];
    for (int i = 0; i < num_streams; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    // Allocate memory
    float *h_a, *d_a;
    CUDA_CHECK(cudaMallocHost(&h_a, size * num_streams));
    CUDA_CHECK(cudaMalloc(&d_a, size * num_streams));

    // Launch operations in parallel streams
    for (int i = 0; i < num_streams; i++) {
        size_t offset = i * n;

        // Async copy
        CUDA_CHECK(cudaMemcpyAsync(
            d_a + offset, h_a + offset, size,
            cudaMemcpyHostToDevice, streams[i]
        ));

        // Kernel in stream
        int block_size = 256;
        int grid_size = (n + block_size - 1) / block_size;
        some_kernel<<<grid_size, block_size, 0, streams[i]>>>(
            d_a + offset, n
        );

        // Async copy back
        CUDA_CHECK(cudaMemcpyAsync(
            h_a + offset, d_a + offset, size,
            cudaMemcpyDeviceToHost, streams[i]
        ));
    }

    // Wait for all streams
    CUDA_CHECK(cudaDeviceSynchronize());

    // Cleanup
    for (int i = 0; i < num_streams; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    }
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFree(d_a));
}

Events for Timing¶

void benchmark_kernel() {
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Record start
    CUDA_CHECK(cudaEventRecord(start));

    // Launch kernel
    some_kernel<<<grid_size, block_size>>>(args);

    // Record stop
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel time: %.3f ms\n", milliseconds);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

Atomic Operations¶

Common Atomics¶

__global__ void histogram(const int* data, int* hist, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        int bin = data[idx];
        atomicAdd(&hist[bin], 1);
    }
}

__global__ void find_max(const float* data, float* result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        // atomicMax for floats (CUDA 11+)
        atomicMax((int*)result, __float_as_int(data[idx]));
    }
}

__global__ void cas_example(int* data) {
    int idx = threadIdx.x;

    // Compare-and-swap
    int old = atomicCAS(&data[0], 0, idx);
    // Only one thread succeeds in setting value
}

Shared Memory Atomics (Faster)¶

__global__ void histogram_shared(const int* data, int* hist, int n, int num_bins) {
    extern __shared__ int s_hist[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Initialize shared histogram
    for (int i = tid; i < num_bins; i += blockDim.x) {
        s_hist[i] = 0;
    }
    __syncthreads();

    // Accumulate in shared memory
    if (idx < n) {
        atomicAdd(&s_hist[data[idx]], 1);
    }
    __syncthreads();

    // Write to global memory
    for (int i = tid; i < num_bins; i += blockDim.x) {
        atomicAdd(&hist[i], s_hist[i]);
    }
}

Thrust Library¶

Using Thrust for High-Level Operations¶

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

void example_thrust() {
    // Host vector
    thrust::host_vector<float> h_vec(1000000);
    for (int i = 0; i < h_vec.size(); i++) {
        h_vec[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // Copy to device
    thrust::device_vector<float> d_vec = h_vec;

    // Sort
    thrust::sort(d_vec.begin(), d_vec.end());

    // Reduce (sum)
    float sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::plus<float>());
    printf("Sum: %f\n", sum);

    // Transform
    thrust::transform(
        d_vec.begin(), d_vec.end(),
        d_vec.begin(),
        thrust::negate<float>()
    );

    // Custom functor
    struct square_functor {
        __host__ __device__
        float operator()(float x) const {
            return x * x;
        }
    };

    thrust::transform(
        d_vec.begin(), d_vec.end(),
        d_vec.begin(),
        square_functor()
    );

    // Copy back
    h_vec = d_vec;
}

Performance Optimization¶

Optimization Guardrails¶

✓ Maximize occupancy (threads per SM)
✓ Coalesce global memory accesses
✓ Use shared memory for reused data
✓ Avoid warp divergence
✓ Minimize host-device transfers
✓ Use streams for overlap
✓ Profile with Nsight Compute

Memory Coalescing¶

// BAD: Strided access (not coalesced)
__global__ void bad_access(float* data, int stride, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx * stride] = 1.0f;  // Non-coalesced!
    }
}

// GOOD: Contiguous access (coalesced)
__global__ void good_access(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] = 1.0f;  // Coalesced
    }
}

// For 2D arrays: access by row (contiguous)
__global__ void row_access(float* matrix, int rows, int cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rows && col < cols) {
        // Row-major: contiguous within warp if threads vary in col
        matrix[row * cols + col] = 1.0f;
    }
}

Avoiding Warp Divergence¶

// BAD: Divergent branch
__global__ void divergent_kernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        if (idx % 2 == 0) {
            data[idx] = expensive_operation_a(idx);
        } else {
            data[idx] = expensive_operation_b(idx);
        }
    }
}

// BETTER: Process separately or use predication
__global__ void less_divergent_kernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // Both computed, compiler may optimize
        float a = expensive_operation_a(idx);
        float b = expensive_operation_b(idx);
        data[idx] = (idx % 2 == 0) ? a : b;
    }
}

Testing¶

Simple Test Framework¶

#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>

#define ASSERT_NEAR(expected, actual, epsilon)                    \
    do {                                                          \
        if (fabs((expected) - (actual)) > (epsilon)) {           \
            fprintf(stderr, "FAILED: %s:%d: expected %f, got %f\n", \
                    __FILE__, __LINE__, (expected), (actual));    \
            return false;                                         \
        }                                                         \
    } while (0)

bool test_vector_add() {
    const int n = 1000;
    float *a, *b, *c;

    CUDA_CHECK(cudaMallocManaged(&a, n * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&b, n * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&c, n * sizeof(float)));

    // Initialize
    for (int i = 0; i < n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Run kernel
    launch_vector_add(a, b, c, n);
    CUDA_CHECK(cudaDeviceSynchronize());

    // Verify
    for (int i = 0; i < n; i++) {
        ASSERT_NEAR(3.0f, c[i], 1e-5f);
    }

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));

    printf("test_vector_add PASSED\n");
    return true;
}

int main() {
    bool all_passed = true;

    all_passed &= test_vector_add();
    // Add more tests...

    return all_passed ? 0 : 1;
}

Common Pitfalls¶

Don't Do This¶

// Forgetting bounds check
__global__ void bad_bounds(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] = 1.0f;  // Out of bounds if idx >= n!
}

// Not synchronizing before checking errors
kernel<<<grid, block>>>(args);
// cudaGetLastError() here won't catch async kernel errors

// Memory leak
float* d_ptr;
cudaMalloc(&d_ptr, size);
// Forgot cudaFree(d_ptr)!

// Race condition in shared memory
__global__ void race_condition() {
    __shared__ float shared_data[256];
    shared_data[threadIdx.x] = threadIdx.x;
    // Missing __syncthreads()!
    float val = shared_data[threadIdx.x + 1];  // Race!
}

Do This Instead¶

// Always check bounds
__global__ void good_bounds(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] = 1.0f;
    }
}

// Synchronize before checking errors
kernel<<<grid, block>>>(args);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK_LAST();

// RAII wrapper for CUDA memory
template<typename T>
class DeviceBuffer {
public:
    explicit DeviceBuffer(size_t count) : size_(count * sizeof(T)) {
        CUDA_CHECK(cudaMalloc(&ptr_, size_));
    }

    ~DeviceBuffer() {
        cudaFree(ptr_);
    }

    T* get() { return ptr_; }

    // Non-copyable
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

private:
    T* ptr_;
    size_t size_;
};

// Proper synchronization
__global__ void no_race() {
    __shared__ float shared_data[256];
    shared_data[threadIdx.x] = threadIdx.x;
    __syncthreads();  // Wait for all threads
    float val = shared_data[(threadIdx.x + 1) % blockDim.x];
}

Debugging Tools¶

cuda-memcheck¶

# Check for memory errors
cuda-memcheck ./program

# Check for race conditions
cuda-memcheck --tool racecheck ./program

# Check for initialization errors
cuda-memcheck --tool initcheck ./program

# Memory leak detection
cuda-memcheck --leak-check full ./program

Nsight Compute (Profiling)¶

# Profile kernel
ncu ./program

# Profile specific kernel
ncu --kernel-name "my_kernel" ./program

# Generate report
ncu -o report ./program
ncu -i report.ncu-rep  # View report

Device Query¶

void print_device_info() {
    int device;
    cudaGetDevice(&device);

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device);

    printf("Device: %s\n", props.name);
    printf("Compute capability: %d.%d\n", props.major, props.minor);
    printf("Total global memory: %.2f GB\n",
           props.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("Shared memory per block: %zu KB\n",
           props.sharedMemPerBlock / 1024);
    printf("Max threads per block: %d\n", props.maxThreadsPerBlock);
    printf("Warp size: %d\n", props.warpSize);
    printf("Max grid size: (%d, %d, %d)\n",
           props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
}

CUDA Guide¶

Core Principles¶

Language-Specific Guardrails¶

CUDA Version & Setup¶

Code Style¶

Memory Management¶

Error Handling¶

Thread Organization¶

Project Structure¶

CMake CUDA Project¶

CMakeLists.txt¶

Basic CUDA Patterns¶

Error Checking Macro¶

Simple Kernel¶

Grid-Stride Loop Pattern¶

Memory Management¶

Explicit Memory Management¶

Unified Memory¶

Pinned Memory for Fast Transfers¶

Shared Memory¶

Basic Shared Memory Usage¶

Tiled Matrix Multiplication¶

Streams and Async Operations¶

Concurrent Kernels with Streams¶

Events for Timing¶

Atomic Operations¶

Common Atomics¶

Shared Memory Atomics (Faster)¶

Thrust Library¶

Using Thrust for High-Level Operations¶

Performance Optimization¶

Optimization Guardrails¶

Memory Coalescing¶

Avoiding Warp Divergence¶

Testing¶

Simple Test Framework¶

Common Pitfalls¶

Don't Do This¶

Do This Instead¶

Debugging Tools¶

cuda-memcheck¶

Nsight Compute (Profiling)¶

Device Query¶

References¶