Documentation Index
Fetch the complete documentation index at: https://mintlify.com/MicrosoftDocs/cpp-docs/llms.txt
Use this file to discover all available pages before exploring further.
OpenMP (Open Multi-Processing) is a portable, standards-based API for shared-memory parallel programming in C, C++, and Fortran. Visual C++ supports the OpenMP 2.0 specification via the /openmp compiler flag. OpenMP uses compiler directives (#pragma omp) to annotate existing serial code with parallelism hints, making it the easiest way to add multi-threading to existing numerical and scientific code without restructuring. The MSVC compiler processes these pragmas to generate multi-threaded code; if compiled without /openmp, the pragmas are silently ignored and the code runs serially.
Enabling OpenMP
Visual Studio IDE
Command Line
CMake
Project Properties → C/C++ → Language → OpenMP Support → Yes (/openmp). Or, for the newer experimental OpenMP LLVM runtime: OpenMP Support → Yes with LLVM runtime (/openmp:llvm).
cl /O2 /openmp myprogram.cpp
# With LLVM runtime (experimental, supports newer OpenMP features):
cl /O2 /openmp:llvm myprogram.cpp
find_package(OpenMP REQUIRED)
target_link_libraries(my_target PRIVATE OpenMP::OpenMP_CXX)
target_compile_options(my_target PRIVATE /openmp)
Once enabled, the predefined macro _OPENMP is defined (value = 200203 for OpenMP 2.0).
Core Directives
#pragma omp parallel — Fork-Join Parallelism
The parallel directive creates a parallel region: the master thread forks a team of threads, every thread in the team executes the enclosed block concurrently, and the team joins at the closing brace, after which only the master thread continues.
#include <omp.h>
#include <stdio.h>
/* Fork a team of threads, let each one report its ID, then continue serially. */
int main(void) {
    #pragma omp parallel
    {
        /* Declared inside the region, so every thread has its own copies. */
        const int thread_id = omp_get_thread_num();
        const int team_size = omp_get_num_threads();
        printf("Hello from thread %d of %d\n", thread_id, team_size);
    }
    /* The team has joined at the end of the region; one thread continues. */

    printf("Back to serial execution.\n");
    return 0;
}
#pragma omp for — Parallel Loop
The for directive distributes loop iterations among the thread team. It must appear inside a parallel region (or combined as parallel for):
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h> /* malloc, free, EXIT_FAILURE */

/*
 * Fill an array with sqrt(i) using a combined parallel-for loop and print
 * one sample element.
 *
 * Returns 0 on success, or EXIT_FAILURE if the array cannot be allocated.
 */
int main(void) {
    const int N = 1000000;

    /* The cast keeps the sample valid when it is compiled as C++ as well as C. */
    double *result = (double *)malloc((size_t)N * sizeof *result);
    if (result == NULL) {
        fprintf(stderr, "Failed to allocate %d doubles\n", N);
        return EXIT_FAILURE;
    }

    /* Combined parallel + for — most common pattern.
     * Each iteration writes a distinct element, so no synchronization is needed. */
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N; i++) {
        result[i] = sqrt((double)i);
    }

    printf("result[500000] = %.6f\n", result[500000]);
    free(result);
    return 0;
}
Schedule clauses control how iterations are divided:
| Schedule | Description | Best for |
|---|---|---|
| static | Even chunks assigned up front (default) | Equal-duration iterations |
| dynamic | Iterations assigned on demand as threads finish | Uneven workloads |
| guided | Decreasing chunk sizes assigned dynamically | Mix of short and long iterations |
| auto | Compiler/runtime chooses (introduced in OpenMP 3.0; not part of OpenMP 2.0) | General use |
// Dynamic scheduling — good when iterations have varying cost.
// The second argument (32) is the chunk size: a thread that finishes its
// chunk is handed the next 32 iterations.
// NOTE: N and process_item() are not declared in this fragment; they are
// assumed to come from the surrounding program.
#pragma omp parallel for schedule(dynamic, 32)
for (int i = 0; i < N; i++) {
process_item(i); // Variable cost per i
}
reduction Clause
The reduction clause safely combines per-thread partial results into a single value at the end of the parallel region:
#include <omp.h>
#include <stdio.h>
/* Compute the sum 1..N and the product 1..5 in one parallel loop via reductions. */
int main(void) {
    const int N = 1000;
    long long total = 0;
    double factorial = 1.0;

    /*
     * Every thread accumulates into its own private copy of each reduction
     * variable; the copies are combined with '+' and '*' respectively into
     * the original variables when the loop ends.
     */
    #pragma omp parallel for reduction(+:total) reduction(*:factorial)
    for (int k = 1; k <= N; k++) {
        total += k;
        if (k <= 5) {
            factorial *= k; /* small range to avoid overflow */
        }
    }

    printf("Sum 1..%d = %lld\n", N, total);     /* 500500 */
    printf("Product 1..5 = %.0f\n", factorial); /* 120 */
    return 0;
}
Supported reduction operators: +, *, -, &, |, ^, &&, ||.
#pragma omp sections — Parallel Tasks
sections assigns independent code blocks to different threads — useful for running distinct tasks in parallel:
#include <omp.h>
#include <stdio.h>
/* Stand-in startup tasks: each one only reports which thread is running it. */

void load_database(void)
{
    const int tid = omp_get_thread_num();
    printf("Thread %d: loading DB\n", tid);
}

void init_network(void)
{
    const int tid = omp_get_thread_num();
    printf("Thread %d: initializing network\n", tid);
}

void load_config(void)
{
    const int tid = omp_get_thread_num();
    printf("Thread %d: loading config\n", tid);
}
/* Run the three independent startup tasks as parallel sections, then report. */
int main(void) {
    #pragma omp parallel sections
    {
        /* Each section is executed once, by one thread of the team. */
        #pragma omp section
        {
            load_database();
        }

        #pragma omp section
        {
            init_network();
        }

        #pragma omp section
        {
            load_config();
        }
    }
    /* Reached only after every section has finished. */

    printf("All initialization complete.\n");
    return 0;
}
#pragma omp single — One Thread Executes
Within a parallel region, single ensures only one thread executes the block (the first one to reach it). Others wait at an implicit barrier unless nowait is specified.
// Every thread in the team executes the parallel block below.
#pragma omp parallel
{
// Exactly one thread of the team (not necessarily the master) runs the
// single block; no nowait clause is given, so the others wait at its end.
#pragma omp single
{
printf("Thread %d initializes shared resource\n", omp_get_thread_num());
// Only executed by one thread
}
// All threads resume here (after barrier)
// NOTE: do_parallel_work() is not declared in this fragment; it is assumed
// to be provided by the surrounding program.
do_parallel_work();
}
#pragma omp critical — Mutual Exclusion
critical protects a block so only one thread executes it at a time — equivalent to a mutex lock:
#include <omp.h>
#include <stdio.h> /* printf */

/* Shared by every thread in the team; all updates go through the critical section. */
int shared_counter = 0;

/* Increment a shared counter 10000 times from a parallel loop and print it. */
int main(void) {
    #pragma omp parallel for
    for (int i = 0; i < 10000; i++) {
        /* Named critical section: at most one thread at a time executes the
         * block, so the read-modify-write of shared_counter cannot race. */
        #pragma omp critical (counter_update)
        {
            shared_counter++; // Protected — only one thread at a time
        }
    }

    printf("Counter = %d\n", shared_counter); // Always 10000
    return 0;
}
critical sections serialize execution. Use them only when necessary — they are the primary source of OpenMP performance degradation. For simple integer updates, prefer atomic.
#pragma omp atomic — Lightweight Atomic Updates
atomic provides a lighter-weight alternative to critical for simple read-modify-write operations on scalar variables:
// Both variables are declared outside the loop, so all threads update the
// same objects.
int counter = 0;
double accumulator = 0.0;
#pragma omp parallel for
for (int i = 0; i < 100000; i++) {
// Each atomic directive applies only to the single statement that follows it.
#pragma omp atomic
counter++; // Atomic increment
#pragma omp atomic
accumulator += 0.5; // Atomic floating-point add
}
Data Scoping Clauses
Control which variables are shared or private across the thread team:
// NOTE: some_val, loop_var, N and compute() are not declared in this
// fragment; they are assumed to come from the surrounding program.
int shared_var = 0;
int my_private;
#pragma omp parallel for \
shared(shared_var) /* all threads see same copy */ \
private(my_private) /* each thread gets its own copy (uninitialized) */ \
firstprivate(some_val) /* private; each copy starts with the original variable's value */ \
lastprivate(loop_var) /* private; the value from the sequentially last iteration is copied back to the original variable */
for (int i = 0; i < N; i++) {
my_private = compute(i);
// shared_var must be protected if written
}
Environment Variables
OpenMP runtime behavior can be controlled through environment variables set before program launch:
# Set the number of threads for parallel regions
set OMP_NUM_THREADS=8
# Binding/placement of threads (OpenMP 4.0+, /openmp:llvm)
set OMP_PROC_BIND=close
# Schedule used by loops that specify schedule(runtime)
set OMP_SCHEDULE=dynamic,64
# Stack size per thread (OpenMP 3.0+; number with optional B/K/M/G suffix, kilobytes if no suffix)
set OMP_STACKSIZE=16M
Runtime Library Functions
#include <omp.h>
// NOTE: these lines are independent usage samples, not one runnable program;
// do_work() is assumed to be provided by the surrounding program.
// Query thread count and ID
int total_threads = omp_get_num_threads(); // In parallel region
int thread_id = omp_get_thread_num(); // 0 = master
int max_threads = omp_get_max_threads(); // Outside parallel region
// Set thread count programmatically
omp_set_num_threads(4);
// Timing (wall clock, seconds)
double start = omp_get_wtime();
do_work();
double elapsed = omp_get_wtime() - start;
// Query if running in a parallel region
int in_parallel = omp_in_parallel(); // 0 or 1
// Locking primitives: a lock is initialized before first use, set/unset
// around the protected code, and destroyed once it is no longer needed.
omp_lock_t my_lock;
omp_init_lock(&my_lock);
omp_set_lock(&my_lock);
// critical section
omp_unset_lock(&my_lock);
omp_destroy_lock(&my_lock);
Complete Example: Parallel Matrix Multiplication
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define N 512
/*
 * Compute C = A * B for N x N matrices of doubles.
 *
 * The two outer loops are collapsed into a single parallel iteration space,
 * so every (row, col) output element is an independent unit of work. Each
 * element of C is written by exactly one iteration, so no synchronization
 * is needed.
 */
void matrix_multiply(double A[N][N], double B[N][N], double C[N][N]) {
    #pragma omp parallel for collapse(2) schedule(static)
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            /* Dot product of row `row` of A with column `col` of B. */
            double dot = 0.0;
            for (int k = 0; k < N; k++) {
                dot += A[row][k] * B[k][col];
            }
            C[row][col] = dot;
        }
    }
}
/* Fill A and B, time one call to matrix_multiply, and print the result. */
int main(void) {
    /* Static storage duration keeps the three N x N matrices off the stack. */
    static double A[N][N], B[N][N], C[N][N];

    /* Initialize matrices; every (row, col) pair is independent. */
    #pragma omp parallel for collapse(2)
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            A[row][col] = (double)(row + col) / N;
            B[row][col] = (double)(row * col + 1) / N;
        }
    }

    /* Wall-clock timing around the multiplication only. */
    const double started = omp_get_wtime();
    matrix_multiply(A, B, C);
    const double finished = omp_get_wtime();

    printf("Matrix multiplication (%dx%d) with %d threads: %.3f seconds\n",
           N, N, omp_get_max_threads(), finished - started);
    printf("C[N/2][N/2] = %.6f\n", C[N / 2][N / 2]);
    return 0;
}
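The collapse(n) clause merges n nested loops into a single parallel iteration space, giving the scheduler more granularity to distribute work — especially useful when either outer loop has a small iteration count. Note that collapse was introduced in OpenMP 3.0, so it is not part of the OpenMP 2.0 feature set provided by /openmp; confirm that your compiler version and OpenMP mode accept it before relying on it.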