MSVC Compiler Intrinsics: SIMD, Bit Ops, and CPU Info

Compiler intrinsics are special built-in functions that the MSVC compiler replaces with one or a few machine instructions inline, eliminating function call overhead and enabling direct access to processor capabilities not expressible in standard C or C++. Unlike inline assembly, intrinsics are recognized by the optimizer — the compiler understands their semantics, can reorder them with other instructions, and can sometimes emit more efficient alternatives based on context. Intrinsics are essential for performance-critical code paths: SIMD operations on arrays, cryptographic routines, lock-free synchronization primitives, and hardware performance counters.

Including Intrinsic Headers

#include <intrin.h>        // All MSVC intrinsics (x86/x64, bit ops, CPU info)
#include <immintrin.h>     // Intel SSE/AVX/AVX-512 intrinsics
#include <ammintrin.h>     // AMD-specific intrinsics

// For specific SIMD levels:
#include <xmmintrin.h>     // SSE  (_mm_* with float, __m128)
#include <emmintrin.h>     // SSE2 (_mm_* with int/double, __m128i/__m128d)
#include <immintrin.h>     // AVX/AVX2 (_mm256_*, __m256/__m256i)

Including <intrin.h> on Windows pulls in all architecture-appropriate intrinsic declarations. For cross-platform code, use the specific ISA headers (<immintrin.h>) and guard with __AVX2__ or similar feature macros.

Enabling SIMD via Compiler Flags

# Enable SSE2 (always available on x64; on x86 it is the default and can be requested explicitly with /arch:SSE2)
cl /O2 /arch:SSE2 myfile.cpp

# Enable AVX (256-bit floating-point SIMD, requires Intel Sandy Bridge / AMD Bulldozer or newer)
cl /O2 /arch:AVX myfile.cpp

# Enable AVX2 (adds integer 256-bit ops, FMA)
cl /O2 /arch:AVX2 myfile.cpp

# Enable AVX-512 (512-bit SIMD, Skylake-X / Ice Lake)
cl /O2 /arch:AVX512 myfile.cpp

Use the /Qvec-report:1 flag to list the loops that were auto-vectorized, or /Qvec-report:2 to also list the loops that were not vectorized together with a reason code.

CPU Feature Detection with `__cpuid`

Before using AVX or other optional instructions, check whether the current CPU supports them:

#include <intrin.h>
#include <immintrin.h>  // _xgetbv
#include <stdio.h>
#include <string.h>     // memcpy

// Set of optional instruction-set extensions reported for the current CPU.
// Filled in by detect_cpu_features(); a value-initialized instance has every flag false.
struct CPUFeatures {
    bool sse41;   // SSE4.1  (derived from CPUID leaf 1, ECX bit 19)
    bool avx;     // AVX     (derived from CPUID leaf 1, ECX bit 28)
    bool avx2;    // AVX2    (derived from CPUID leaf 7 sub-leaf 0, EBX bit 5)
    bool popcnt;  // POPCNT  (derived from CPUID leaf 1, ECX bit 23)
};

// Queries CPUID (and XCR0 where required) and reports which optional
// instruction-set extensions may safely be used on the current machine.
//
// Returns a CPUFeatures whose flags are true only when the feature is usable:
//   - sse41 / popcnt: the CPU advertises the feature in CPUID leaf 1.
//   - avx:  the CPU advertises AVX *and* the OS has enabled XSAVE and saves
//           both XMM and YMM state (XCR0 bits 1 and 2). Without the OS part,
//           executing an AVX instruction faults even though CPUID reports it.
//   - avx2: avx is usable *and* CPUID leaf 7 reports AVX2.
// All flags are false on processors that do not implement the queried leaves.
CPUFeatures detect_cpu_features() {
    CPUFeatures f = {};
    int info[4] = {}; // EAX, EBX, ECX, EDX

    // Leaf 0: EAX holds the highest basic leaf this CPU implements. Querying a
    // leaf above that limit returns unrelated data, so check before each use.
    __cpuid(info, 0);
    const int max_basic_leaf = info[0];
    if (max_basic_leaf < 1) {
        return f;
    }

    // Leaf 1: feature flags in ECX.
    __cpuid(info, 1);
    const int ecx1 = info[2];
    f.sse41  = (ecx1 & (1 << 19)) != 0; // ECX bit 19
    f.popcnt = (ecx1 & (1 << 23)) != 0; // ECX bit 23

    const bool os_uses_xsave = (ecx1 & (1 << 27)) != 0; // ECX bit 27 (OSXSAVE)
    const bool cpu_has_avx   = (ecx1 & (1 << 28)) != 0; // ECX bit 28

    // XGETBV is only executable once the OS has set OSXSAVE; XCR0 bit 1 means
    // XMM state is saved, bit 2 means YMM state is saved on context switch.
    bool os_saves_ymm = false;
    if (os_uses_xsave) {
        os_saves_ymm = (_xgetbv(0) & 0x6) == 0x6;
    }
    f.avx = cpu_has_avx && os_saves_ymm;

    // Leaf 7, sub-leaf 0: structured extended features in EBX. AVX2 uses the
    // same YMM registers as AVX, so it is usable only if AVX itself is.
    if (f.avx && max_basic_leaf >= 7) {
        __cpuidex(info, 7, 0);
        f.avx2 = (info[1] & (1 << 5)) != 0; // EBX bit 5
    }

    return f;
}

// Prints the processor brand string followed by one Yes/No line for each
// feature reported by detect_cpu_features().
//
// The brand string is read from CPUID extended leaves 0x80000002-0x80000004
// (16 bytes each). Those leaves are only queried when leaf 0x80000000 reports
// that they exist; otherwise a placeholder is printed instead of whatever
// unrelated register contents an unsupported leaf would return.
void print_cpu_info() {
    int info[4] = {};
    char brand[49] = {}; // 3 leaves x 16 bytes, plus a guaranteed NUL terminator

    // Leaf 0x80000000: EAX holds the highest extended leaf the CPU implements.
    __cpuid(info, 0x80000000);
    const unsigned int max_extended_leaf = (unsigned int)info[0];

    if (max_extended_leaf >= 0x80000004u) {
        for (int part = 0; part < 3; ++part) {
            __cpuid(info, (int)(0x80000002u + (unsigned int)part));
            memcpy(brand + 16 * part, info, sizeof info);
        }
        printf("CPU: %s\n", brand);
    } else {
        printf("CPU: %s\n", "(brand string not available)");
    }

    auto f = detect_cpu_features();
    printf("SSE4.1: %s\n", f.sse41  ? "Yes" : "No");
    printf("AVX:    %s\n", f.avx    ? "Yes" : "No");
    printf("AVX2:   %s\n", f.avx2   ? "Yes" : "No");
    printf("POPCNT: %s\n", f.popcnt ? "Yes" : "No");
}

// Entry point of the CPUID example: prints the CPU brand string and the
// detected feature flags via print_cpu_info(), then exits with status 0.
int main() {
    print_cpu_info();
    return 0;
}

Bit Manipulation Intrinsics

`_BitScanForward` and `_BitScanReverse`

These intrinsics map to the BSF and BSR x86 instructions — finding the position of the lowest or highest set bit:

#include <intrin.h>
#include <stdio.h>

// Demonstrates the bit-scan intrinsics: prints the index of the lowest and
// highest set bit of a 32-bit mask and, on 64-bit targets, of a 64-bit mask.
//
// Each intrinsic returns zero when the mask is zero, in which case the index
// it wrote is not meaningful, so every result is printed only after checking
// the return value.
int main() {
    unsigned long mask = 0b0001'1010'0000; // 0x1A0: bits 5, 7, and 8 set
    unsigned long index;

    // Find lowest set bit
    if (_BitScanForward(&index, mask)) {
        printf("Lowest set bit: %lu\n", index);  // 5
    }

    // Find highest set bit
    if (_BitScanReverse(&index, mask)) {
        printf("Highest set bit: %lu\n", index); // 8
    }

    // 64-bit variants: only provided by the compiler for 64-bit targets, so
    // guard them to keep the example building for 32-bit x86 as well.
#if defined(_M_X64) || defined(_M_ARM64)
    unsigned __int64 mask64 = (1ULL << 63) | (1ULL << 7);
    unsigned long idx64;

    if (_BitScanForward64(&idx64, mask64)) {
        printf("BSF64: %lu\n", idx64); // 7
    }

    if (_BitScanReverse64(&idx64, mask64)) {
        printf("BSR64: %lu\n", idx64); // 63
    }
#endif

    return 0;
}

`__popcnt` — Population Count

Counts the number of set bits in an integer (maps to the POPCNT instruction):

#include <intrin.h>
#include <stdio.h>

// Demonstrates the population-count intrinsics by printing the number of set
// bits in a 32-bit value and, on x64 targets, in a 64-bit value.
//
// NOTE: these intrinsics emit the POPCNT instruction unconditionally; on a CPU
// without POPCNT the result is unpredictable, so production code should check
// the POPCNT CPUID flag (leaf 1, ECX bit 23) first.
int main() {
    unsigned int v32 = 0b1010'1010'1010'1010; // 0xAAAA
    // __popcnt returns unsigned int, so print it with %u rather than %d.
    printf("popcount(0xAAAA) = %u\n", __popcnt(v32)); // 8

    // __popcnt64 exists only when compiling for x64.
#if defined(_M_X64)
    unsigned __int64 v64 = 0xFFFFFFFF00000000ULL;
    printf("popcnt64 = %d\n", (int)__popcnt64(v64)); // 32
#endif

    return 0;
}

`__rdtsc` — Read Time-Stamp Counter

Maps to the RDTSC instruction — reads the processor’s cycle counter. Useful for micro-benchmarking:

#include <intrin.h>
#include <stdio.h>

// Micro-benchmark example: samples the time-stamp counter before and after a
// one-million-iteration floating-point loop and prints the difference.
int main() {
    const unsigned __int64 ticks_before = __rdtsc();

    // volatile forces every accumulation to really happen, so the optimizer
    // cannot collapse or remove the loop being measured.
    volatile double result = 0.0;
    int iteration = 0;
    while (iteration < 1000000) {
        result += (double)iteration * 0.001;
        ++iteration;
    }

    const unsigned __int64 ticks_after = __rdtsc();
    const unsigned __int64 elapsed = ticks_after - ticks_before;

    printf("Loop took %llu CPU cycles\n", elapsed);
    printf("Result: %f (prevents optimization)\n", result);
    return 0;
}

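__rdtsc returns time-stamp counter ticks, not wall-clock time. Depending on the processor, the counter either follows the current core frequency (older CPUs) or ticks at a constant rate independent of it (the "invariant TSC" of modern CPUs), so on CPUs with variable clock speeds (SpeedStep, Turbo Boost) a tick count cannot be assumed to equal either executed core cycles or elapsed real time without calibration. RDTSC is also not a serializing instruction, so the processor may reorder nearby instructions around it. Use QueryPerformanceCounter or omp_get_wtime for wall-clock timing.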

SSE Intrinsics — 128-bit SIMD

SSE operates on __m128 (four 32-bit floats), __m128d (two 64-bit doubles), or __m128i (integers):

#include <xmmintrin.h>  // SSE
#include <emmintrin.h>  // SSE2
#include <stdio.h>

// Element-wise sum of two float arrays: c[k] = a[k] + b[k] for 0 <= k < n.
// Full groups of four elements are handled by one SSE add instruction each;
// whatever is left over (fewer than four elements) is added one at a time.
// Unaligned loads/stores are used, so the pointers need no special alignment.
void sse_add(const float* a, const float* b, float* c, int n) {
    const int last_block_start = n - 4;  // highest index at which 4 lanes still fit
    int pos = 0;

    // Vector part: 4 lanes per iteration
    while (pos <= last_block_start) {
        const __m128 lanes_sum = _mm_add_ps(_mm_loadu_ps(&a[pos]),
                                            _mm_loadu_ps(&b[pos]));
        _mm_storeu_ps(&c[pos], lanes_sum);
        pos += 4;
    }

    // Scalar tail: the remaining n % 4 elements
    while (pos < n) {
        c[pos] = a[pos] + b[pos];
        ++pos;
    }
}

// SSE example driver: adds two five-element arrays with sse_add() (one full
// 4-lane block plus a single scalar-tail element) and prints each sum.
int main() {
    enum { COUNT = 5 };
    float lhs[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    float rhs[] = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f};
    float sums[COUNT];

    sse_add(lhs, rhs, sums, COUNT);

    int k = 0;
    while (k < COUNT) {
        printf("c[%d] = %.1f\n", k, sums[k]); // 11, 22, 33, 44, 55
        ++k;
    }
    return 0;
}

AVX2 Intrinsics — 256-bit SIMD

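AVX doubles the floating-point SIMD width to 256 bits, processing 8 floats or 4 doubles per instruction; AVX2 extends the 256-bit operations to integers. The example below also uses a fused multiply-add (FMA) instruction, which MSVC enables together with AVX2. Requires /arch:AVX2: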

#include <immintrin.h>  // AVX/AVX2
#include <stdio.h>

// Dot product of two float arrays using AVX2
float avx2_dot_product(const float* a, const float* b, int n) {
    __m256 sum = _mm256_setzero_ps();  // Initialize accumulator to zero

    int i = 0;
    for (; i <= n - 8; i += 8) {
        // Load 8 floats from each array
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        // Fused multiply-add: sum += va * vb (requires /arch:AVX2)
        sum = _mm256_fmadd_ps(va, vb, sum);
    }

    // Horizontal sum of the 8 lanes
    // Store the __m256 result and sum manually
    float parts[8];
    _mm256_storeu_ps(parts, sum);
    float total = parts[0]+parts[1]+parts[2]+parts[3]
                 +parts[4]+parts[5]+parts[6]+parts[7];

    // Handle remaining elements
    for (; i < n; i++) total += a[i] * b[i];
    return total;
}

// AVX2 example driver: builds a = {0, 1, ..., 15} and b = {1, 1, ..., 1},
// then prints their dot product as computed by avx2_dot_product().
int main() {
    enum { COUNT = 16 };
    float a[COUNT], b[COUNT];

    int k = 0;
    while (k < COUNT) {
        a[k] = (float)k;
        b[k] = 1.0f;
        ++k;
    }

    const float dot = avx2_dot_product(a, b, COUNT);
    printf("Dot product = %.1f\n", dot); // 120.0
    return 0;
}

Common Intrinsic Reference

Intrinsic	Instruction	Description
`__cpuid(info, leaf)`	`CPUID`	Query CPU features
`_BitScanForward(&idx, mask)`	`BSF`	Find lowest set bit
`_BitScanReverse(&idx, mask)`	`BSR`	Find highest set bit
`__popcnt(v)`	`POPCNT`	Count set bits
`__rdtsc()`	`RDTSC`	Read CPU cycle counter
`_mm_add_ps(a, b)`	`ADDPS`	Add 4 floats (SSE)
`_mm256_load_ps(ptr)`	`VMOVAPS`	Load 8 aligned floats (AVX)
`_mm256_add_ps(a, b)`	`VADDPS`	Add 8 floats (AVX)
`_mm256_fmadd_ps(a,b,c)`	`VFMADD231PS`	Fused multiply-add (FMA3; enabled by /arch:AVX2)
`__assume(cond)`	—	Hint optimizer that `cond` is always true
`_ReadWriteBarrier()`	—	Prevent compiler memory reorder
`__debugbreak()`	`INT 3`	Trigger debugger breakpoint

Runtime Library

MFC & ATL

Parallel Programming

Intrinsics & Portability

MSVC Compiler Intrinsics: SIMD, Bit Ops, and CPU Info

Including Intrinsic Headers

Enabling SIMD via Compiler Flags

CPU Feature Detection with `__cpuid`

Bit Manipulation Intrinsics

`_BitScanForward` and `_BitScanReverse`

`__popcnt` — Population Count

`__rdtsc` — Read Time-Stamp Counter

SSE Intrinsics — 128-bit SIMD

AVX2 Intrinsics — 256-bit SIMD

Common Intrinsic Reference

Build docs developers (and LLMs) love

Runtime Library

MFC & ATL

Parallel Programming

Intrinsics & Portability

Documentation Index

​Including Intrinsic Headers

​Enabling SIMD via Compiler Flags

​CPU Feature Detection with __cpuid

​Bit Manipulation Intrinsics

​_BitScanForward and _BitScanReverse

​__popcnt — Population Count

​__rdtsc — Read Time-Stamp Counter

​SSE Intrinsics — 128-bit SIMD

​AVX2 Intrinsics — 256-bit SIMD

​Common Intrinsic Reference

Build docs developers (and LLMs) love

Including Intrinsic Headers

Enabling SIMD via Compiler Flags

CPU Feature Detection with `__cpuid`

Bit Manipulation Intrinsics

`_BitScanForward` and `_BitScanReverse`

`__popcnt` — Population Count

`__rdtsc` — Read Time-Stamp Counter

SSE Intrinsics — 128-bit SIMD

AVX2 Intrinsics — 256-bit SIMD

Common Intrinsic Reference