Documentation Index
Fetch the complete documentation index at: https://mintlify.com/MicrosoftDocs/cpp-docs/llms.txt
Use this file to discover all available pages before exploring further.
Compiler intrinsics are special built-in functions that the MSVC compiler replaces with one or a few machine instructions inline, eliminating function call overhead and enabling direct access to processor capabilities not expressible in standard C or C++. Unlike inline assembly, intrinsics are recognized by the optimizer — the compiler understands their semantics, can reorder them with other instructions, and can sometimes emit more efficient alternatives based on context. Intrinsics are essential for performance-critical code paths: SIMD operations on arrays, cryptographic routines, lock-free synchronization primitives, and hardware performance counters.
#include <intrin.h> // All MSVC intrinsics (x86/x64, bit ops, CPU info)
#include <immintrin.h> // Intel SSE/AVX/AVX-512 intrinsics
#include <ammintrin.h> // AMD-specific intrinsics
// For specific SIMD levels:
#include <xmmintrin.h> // SSE (_mm_* with float, __m128)
#include <emmintrin.h> // SSE2 (_mm_* with int/double, __m128i/__m128d)
#include <immintrin.h> // AVX/AVX2 (_mm256_*, __m256/__m256i)
Including <intrin.h> on Windows pulls in all architecture-appropriate intrinsic declarations. For cross-platform code, use the specific ISA headers (<immintrin.h>) and guard with __AVX2__ or similar feature macros.
Enabling SIMD via Compiler Flags
# Enable SSE2 (always available on x64; on x86, enable it with /arch:SSE2)
cl /O2 /arch:SSE2 myfile.cpp
# Enable AVX (256-bit floating-point SIMD; requires Sandy Bridge/Bulldozer or newer)
cl /O2 /arch:AVX myfile.cpp
# Enable AVX2 (adds 256-bit integer ops and FMA; requires Haswell/Excavator or newer)
cl /O2 /arch:AVX2 myfile.cpp
# Enable AVX-512 (512-bit SIMD, Skylake-X / Ice Lake)
cl /O2 /arch:AVX512 myfile.cpp
Use the /Qvec-report:2 flag to see which loops were auto-vectorized and, for loops that were not, a reason code explaining why.
CPU Feature Detection with __cpuid
Before using AVX or other optional instructions, check whether the current CPU supports them:
#include <intrin.h>
#include <stdio.h>
#include <string.h>
// Set of optional x86 instruction-set extensions, one flag per feature.
// A value-initialized instance (CPUFeatures f = {}) has every flag false.
struct CPUFeatures {
    bool sse41;  // SSE4.1
    bool avx;    // AVX
    bool avx2;   // AVX2
    bool popcnt; // POPCNT instruction
};
// Queries CPUID and reports which optional instruction sets can be used.
//
// Returns a CPUFeatures value in which a flag is true only when the processor
// advertises the feature. avx and avx2 are additionally gated on the operating
// system having enabled XMM/YMM state saving (OSXSAVE + XCR0): executing AVX
// instructions without that support faults even if the CPUID bit is set.
CPUFeatures detect_cpu_features() {
    CPUFeatures f = {};
    int info[4]; // EAX, EBX, ECX, EDX

    // Leaf 0: EAX holds the highest basic leaf this CPU implements. Querying
    // an unimplemented leaf returns unrelated data, so check before each use.
    __cpuid(info, 0);
    const int max_leaf = info[0];
    if (max_leaf < 1) {
        return f;
    }

    // Leaf 1: feature flags in ECX.
    __cpuid(info, 1);
    const int ecx1 = info[2];
    f.sse41  = (ecx1 & (1 << 19)) != 0; // ECX bit 19
    f.popcnt = (ecx1 & (1 << 23)) != 0; // ECX bit 23

    const bool osxsave = (ecx1 & (1 << 27)) != 0; // ECX bit 27: XGETBV usable
    const bool cpu_avx = (ecx1 & (1 << 28)) != 0; // ECX bit 28

    // XCR0 bits 1 (SSE state) and 2 (AVX state) must both be set by the OS.
    bool os_saves_ymm = false;
    if (osxsave) {
        os_saves_ymm = (_xgetbv(0) & 0x6) == 0x6;
    }
    f.avx = cpu_avx && os_saves_ymm;

    // Leaf 7, sub-leaf 0: extended feature flags in EBX.
    if (max_leaf >= 7) {
        __cpuidex(info, 7, 0);
        f.avx2 = f.avx && ((info[1] & (1 << 5)) != 0); // EBX bit 5
    }
    return f;
}
// Prints the processor brand string followed by the detected feature flags.
//
// The brand string is read from CPUID extended leaves 0x80000002-0x80000004,
// each of which yields 16 bytes (48 bytes in total). If the processor does not
// implement those leaves, "Unknown" is printed instead of unrelated data.
void print_cpu_info() {
    int info[4];
    char brand[49] = {}; // 48 bytes of brand text plus a guaranteed terminator

    // Leaf 0x80000000: EAX reports the highest extended leaf available.
    __cpuid(info, 0x80000000);
    const unsigned max_ext_leaf = (unsigned)info[0];

    if (max_ext_leaf >= 0x80000004u) {
        for (int part = 0; part < 3; part++) {
            __cpuid(info, (int)(0x80000002u + part));
            memcpy(brand + part * sizeof(info), info, sizeof(info));
        }
    }
    printf("CPU: %s\n", brand[0] != '\0' ? brand : "Unknown");

    auto f = detect_cpu_features();
    printf("SSE4.1: %s\n", f.sse41 ? "Yes" : "No");
    printf("AVX: %s\n", f.avx ? "Yes" : "No");
    printf("AVX2: %s\n", f.avx2 ? "Yes" : "No");
    printf("POPCNT: %s\n", f.popcnt ? "Yes" : "No");
}
// Entry point for the CPUID sample: prints the CPU report and exits with 0.
int main()
{
    print_cpu_info();

    return 0;
}
Bit Manipulation Intrinsics
_BitScanForward and _BitScanReverse
These intrinsics map to the BSF and BSR x86 instructions — finding the position of the lowest or highest set bit:
#include <intrin.h>
#include <stdio.h>
// Demonstrates the bit-scan intrinsics. Each one writes the found bit position
// through its first argument and returns nonzero only when the mask is nonzero,
// so the index is read only after a successful call.
int main() {
    unsigned long mask = 0b0001'1010'0000; // Bits 5, 7, and 8 set
    unsigned long index;

    // Find lowest set bit
    if (_BitScanForward(&index, mask)) {
        printf("Lowest set bit: %lu\n", index); // 5
    }

    // Find highest set bit
    if (_BitScanReverse(&index, mask)) {
        printf("Highest set bit: %lu\n", index); // 8
    }

#if defined(_M_X64) || defined(_M_ARM64)
    // 64-bit variants (only provided when compiling for a 64-bit target)
    unsigned __int64 mask64 = (1ULL << 63) | (1ULL << 7);
    unsigned long idx64;

    if (_BitScanForward64(&idx64, mask64)) {
        printf("BSF64: %lu\n", idx64); // 7
    }
    if (_BitScanReverse64(&idx64, mask64)) {
        printf("BSR64: %lu\n", idx64); // 63
    }
#endif

    return 0;
}
__popcnt — Population Count
Counts the number of set bits in an integer (maps to the POPCNT instruction):
#include <intrin.h>
#include <stdio.h>
// Demonstrates the population-count intrinsics.
//
// Both intrinsics return unsigned values; the results are cast to int so the
// arguments match the %d conversions used in the format strings.
// NOTE: these intrinsics emit POPCNT unconditionally; production code should
// confirm CPU support (CPUID leaf 1, ECX bit 23) before calling them.
int main() {
    unsigned int v32 = 0b1010'1010'1010'1010; // 0xAAAA
    printf("popcount(0xAAAA) = %d\n", (int)__popcnt(v32)); // 8

    unsigned __int64 v64 = 0xFFFFFFFF00000000ULL;
    printf("popcnt64 = %d\n", (int)__popcnt64(v64)); // 32

    return 0;
}
__rdtsc — Read Time-Stamp Counter
Maps to the RDTSC instruction — reads the processor’s cycle counter. Useful for micro-benchmarking:
#include <intrin.h>
#include <stdio.h>
// Times a simple floating-point accumulation loop with the time-stamp counter
// and prints the elapsed tick count together with the accumulated value.
int main()
{
    const unsigned __int64 ticks_before = __rdtsc();

    // volatile forces every read and write of the accumulator to happen, so
    // the loop cannot be removed by the optimizer.
    volatile double accumulator = 0.0;
    int step = 0;
    while (step < 1000000) {
        accumulator = accumulator + (double)step * 0.001;
        ++step;
    }

    const unsigned __int64 ticks_after = __rdtsc();

    printf("Loop took %llu CPU cycles\n", ticks_after - ticks_before);
    printf("Result: %f (prevents optimization)\n", accumulator);
    return 0;
}
__rdtsc counts CPU cycles, not wall-clock time. On CPUs with variable clock speeds (SpeedStep, Turbo Boost), the relationship between cycle count and real time varies. Use QueryPerformanceCounter or omp_get_wtime for wall-clock timing.
SSE Intrinsics — 128-bit SIMD
SSE operates on __m128 (four 32-bit floats), __m128d (two 64-bit doubles), or __m128i (integers):
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <stdio.h>
// Element-wise sum of two float arrays: c[k] = a[k] + b[k] for k in [0, n).
// SSE handles four floats per instruction; any leftover elements (n not a
// multiple of four) are added one at a time. The pointers need not be aligned.
void sse_add(const float* a, const float* b, float* c, int n)
{
    int pos = 0;

    // Vector part: runs while at least four elements remain.
    while (n - pos >= 4) {
        const __m128 lhs = _mm_loadu_ps(&a[pos]); // 4 unaligned floats from a
        const __m128 rhs = _mm_loadu_ps(&b[pos]); // 4 unaligned floats from b
        _mm_storeu_ps(&c[pos], _mm_add_ps(lhs, rhs));
        pos += 4;
    }

    // Scalar tail: at most three elements.
    while (pos < n) {
        c[pos] = a[pos] + b[pos];
        ++pos;
    }
}
// Adds two five-element arrays with sse_add and prints each sum on its own line.
int main()
{
    float a[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    float b[] = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f};
    float c[5];

    sse_add(a, b, c, 5);

    int k = 0;
    while (k < 5) {
        printf("c[%d] = %.1f\n", k, c[k]); // 11, 22, 33, 44, 55
        ++k;
    }
    return 0;
}
AVX2 Intrinsics — 256-bit SIMD
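AVX widens the SIMD registers to 256 bits, processing 8 floats or 4 doubles per instruction; AVX2 extends the 256-bit operations to integers, and /arch:AVX2 also enables the FMA instructions used below. Requires /arch:AVX2: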
#include <immintrin.h> // AVX/AVX2
#include <stdio.h>
// Dot product of two float arrays using AVX2
float avx2_dot_product(const float* a, const float* b, int n) {
__m256 sum = _mm256_setzero_ps(); // Initialize accumulator to zero
int i = 0;
for (; i <= n - 8; i += 8) {
// Load 8 floats from each array
__m256 va = _mm256_loadu_ps(a + i);
__m256 vb = _mm256_loadu_ps(b + i);
// Fused multiply-add: sum += va * vb (requires /arch:AVX2)
sum = _mm256_fmadd_ps(va, vb, sum);
}
// Horizontal sum of the 8 lanes
// Store the __m256 result and sum manually
float parts[8];
_mm256_storeu_ps(parts, sum);
float total = parts[0]+parts[1]+parts[2]+parts[3]
+parts[4]+parts[5]+parts[6]+parts[7];
// Handle remaining elements
for (; i < n; i++) total += a[i] * b[i];
return total;
}
// Builds a = {0, 1, ..., 15} and b = {1, 1, ..., 1}, then prints their dot product.
int main()
{
    float a[16];
    float b[16];

    int k = 0;
    while (k < 16) {
        a[k] = (float)k;
        b[k] = 1.0f;
        ++k;
    }

    const float dot = avx2_dot_product(a, b, 16);
    printf("Dot product = %.1f\n", dot); // 120.0
    return 0;
}
Common Intrinsic Reference
| Intrinsic | Instruction | Description |
|---|---|---|
| __cpuid(info, leaf) | CPUID | Query CPU features |
| _BitScanForward(&idx, mask) | BSF | Find lowest set bit |
| _BitScanReverse(&idx, mask) | BSR | Find highest set bit |
| __popcnt(v) | POPCNT | Count set bits |
| __rdtsc() | RDTSC | Read CPU cycle counter |
| _mm_add_ps(a, b) | ADDPS | Add 4 floats (SSE) |
| _mm256_load_ps(ptr) | VMOVAPS | Load 8 aligned floats (AVX) |
| _mm256_add_ps(a, b) | VADDPS | Add 8 floats (AVX) |
| _mm256_fmadd_ps(a,b,c) | VFMADD231PS | Fused multiply-add (AVX2) |
| __assume(cond) | — | Hint optimizer that cond is always true |
| _ReadWriteBarrier() | — | Prevent compiler memory reorder |
| __debugbreak() | INT 3 | Trigger debugger breakpoint |