© The Author(s), under exclusive license to APress Media, LLC, part of Springer Nature 2022
D. Kusswurm, Modern Parallel Programming with C++ and Assembly Language, https://doi.org/10.1007/978-1-4842-7918-2_8

8. AVX-512 C++ Programming: Part 2

Daniel Kusswurm1  
(1)
Geneva, IL, USA
 

In earlier chapters, you studied a variety of source code examples that demonstrated floating-point calculations and algorithms using AVX and AVX2 C++ SIMD intrinsic functions. In this chapter, you will examine similar source code examples that highlight the use of AVX-512 C++ SIMD intrinsic functions that perform floating-point operations. The first section contains two source code examples that illustrate simple floating-point arithmetic using 512-bit wide operands. The next two sections focus on using AVX-512 to perform computations with floating-point arrays and matrices. The final section explains how to perform discrete convolutions using AVX-512.

Floating-Point Arithmetic

In this section, you will learn how to perform elementary floating-point arithmetic using AVX-512 C++ SIMD intrinsic functions. You will also learn how to carry out merge masking and zero masking using floating-point operands.

Basic Arithmetic

Listing 8-1 contains the source code for example Ch08_01. This example demonstrates basic arithmetic operations using 512-bit wide packed floating-point operands.
//------------------------------------------------
//               Ch08_01.h
//------------------------------------------------
#pragma once
#include "ZmmVal.h"
// Ch08_01_fcpp.cpp
extern void PackedMathF32_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b);
extern void PackedMathF64_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b);
//------------------------------------------------
//               Ch08_01.cpp
//------------------------------------------------
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include "Ch08_01.h"
static void PackedMathF32(void);
static void PackedMathF64(void);
int main()
{
    PackedMathF32();
    PackedMathF64();
}
static void PackedMathF32(void)
{
    ZmmVal a, b, c[8];
    const char nl = ' ';
    a.m_F32[0] = 36.0f;                 b.m_F32[0] = -0.1111111f;
    a.m_F32[1] = 0.03125f;              b.m_F32[1] = 64.0f;
    a.m_F32[2] = 2.0f;                  b.m_F32[2] = -0.0625f;
    a.m_F32[3] = 42.0f;                 b.m_F32[3] = 8.666667f;
    a.m_F32[4] = 7.0f;                  b.m_F32[4] = -18.125f;
    a.m_F32[5] = 20.5f;                 b.m_F32[5] = 56.0f;
    a.m_F32[6] = 36.125f;               b.m_F32[6] = 24.0f;
    a.m_F32[7] = 0.5f;                  b.m_F32[7] = -158.6f;
    a.m_F32[8] = 136.0f;                b.m_F32[8] = -9.1111111f;
    a.m_F32[9] = 2.03125f;              b.m_F32[9] = 864.0f;
    a.m_F32[10] = 32.0f;                b.m_F32[10] = -70.0625f;
    a.m_F32[11] = 442.0f;               b.m_F32[11] = 98.666667f;
    a.m_F32[12] = 57.0f;                b.m_F32[12] = -518.125f;
    a.m_F32[13] = 620.5f;               b.m_F32[13] = 456.0f;
    a.m_F32[14] = 736.125f;             b.m_F32[14] = 324.0f;
    a.m_F32[15] = 80.5f;                b.m_F32[15] = -298.6f;
    PackedMathF32_Iavx512(c, &a, &b);
    std::cout << (" Results for PackedMathF32 ");
    for (unsigned int i = 0; i < 4; i++)
    {
        std::cout << "Group #" << i << nl;
        std::cout << "  a:       " << a.ToStringF32(i) << nl;
        std::cout << "  b:       " << b.ToStringF32(i) << nl;
        std::cout << "  addps:   " << c[0].ToStringF32(i) << nl;
        std::cout << "  subps:   " << c[1].ToStringF32(i) << nl;
        std::cout << "  mulps:   " << c[2].ToStringF32(i) << nl;
        std::cout << "  divps:   " << c[3].ToStringF32(i) << nl;
        std::cout << "  minps:   " << c[4].ToStringF32(i) << nl;
        std::cout << "  maxps:   " << c[5].ToStringF32(i) << nl;
        std::cout << "  sqrtps:  " << c[6].ToStringF32(i) << nl;
        std::cout << "  absps:   " << c[7].ToStringF32(i) << nl;
        std::cout << nl;
    }
}
static void PackedMathF64(void)
{
    ZmmVal a, b, c[8];
    const char nl = ' ';
    a.m_F64[0] = 2.0;           b.m_F64[0] = M_PI;
    a.m_F64[1] = 4.875;         b.m_F64[1] = M_E;
    a.m_F64[2] = 7.5;           b.m_F64[2] = -9.125;
    a.m_F64[3] = 3.0;           b.m_F64[3] = -M_PI;
    a.m_F64[4] = 12.3333;       b.m_F64[4] = M_PI / 2;
    a.m_F64[5] = 24.0;          b.m_F64[5] = M_E / 2;
    a.m_F64[6] = 37.5;          b.m_F64[6] = -9.125 / 2;
    a.m_F64[7] = 43.0;          b.m_F64[7] = -M_PI / 2;
    PackedMathF64_Iavx512(c, &a, &b);
    std::cout << (" Results for PackedMathF64 ");
    for (unsigned int i = 0; i < 4; i++)
    {
        std::cout << "Group #" << i << nl;
        std::cout << "  a:       " << a.ToStringF64(i) << nl;
        std::cout << "  b:       " << b.ToStringF64(i) << nl;
        std::cout << "  addpd:   " << c[0].ToStringF64(i) << nl;
        std::cout << "  subpd:   " << c[1].ToStringF64(i) << nl;
        std::cout << "  mulpd:   " << c[2].ToStringF64(i) << nl;
        std::cout << "  divpd:   " << c[3].ToStringF64(i) << nl;
        std::cout << "  minpd:   " << c[4].ToStringF64(i) << nl;
        std::cout << "  maxpd:   " << c[5].ToStringF64(i) << nl;
        std::cout << "  sqrtpd:  " << c[6].ToStringF64(i) << nl;
        std::cout << "  abspd:   " << c[7].ToStringF64(i) << nl;
        std::cout << nl;
    }
}
//------------------------------------------------
//               Ch08_01_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_01.h"
// Performs eight packed single-precision arithmetic operations on 512-bit
// operands a and b, writing one ZmmVal result per operation into c[0..7].
// All ZmmVal arguments must be aligned on a 64-byte boundary.
void PackedMathF32_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512 av = _mm512_load_ps(a);
    __m512 bv = _mm512_load_ps(b);
    // Calculate the results in the same order the caller displays them.
    __m512 results[8];
    results[0] = _mm512_add_ps(av, bv);
    results[1] = _mm512_sub_ps(av, bv);
    results[2] = _mm512_mul_ps(av, bv);
    results[3] = _mm512_div_ps(av, bv);
    results[4] = _mm512_min_ps(av, bv);
    results[5] = _mm512_max_ps(av, bv);
    results[6] = _mm512_sqrt_ps(av);
    results[7] = _mm512_abs_ps(bv);
    // Save calculated results (destination must be 64-byte aligned).
    for (size_t i = 0; i < 8; i++)
        _mm512_store_ps(&c[i], results[i]);
}
// Double-precision counterpart of PackedMathF32_Iavx512(). Performs eight
// packed F64 arithmetic operations on a and b, one ZmmVal result per
// operation in c[0..7]. All ZmmVal arguments must be 64-byte aligned.
void PackedMathF64_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512d av = _mm512_load_pd(a);
    __m512d bv = _mm512_load_pd(b);
    // Calculate the results in the same order the caller displays them.
    __m512d results[8];
    results[0] = _mm512_add_pd(av, bv);
    results[1] = _mm512_sub_pd(av, bv);
    results[2] = _mm512_mul_pd(av, bv);
    results[3] = _mm512_div_pd(av, bv);
    results[4] = _mm512_min_pd(av, bv);
    results[5] = _mm512_max_pd(av, bv);
    results[6] = _mm512_sqrt_pd(av);
    results[7] = _mm512_abs_pd(bv);
    // Save calculated results (destination must be 64-byte aligned).
    for (size_t i = 0; i < 8; i++)
        _mm512_store_pd(&c[i], results[i]);
}
Listing 8-1

Example Ch08_01

Toward the top of Listing 8-1 are the function declarations for example Ch08_01. Note that these declarations use the ZmmVal structure that you learned about in Chapter 7. The file Ch08_01.cpp contains two functions named PackedMathF32() and PackedMathF64(). These functions perform test case initialization for the SIMD calculating functions PackedMathF32_Iavx512() and PackedMathF64_Iavx512(). They also stream results to std::cout.

The file Ch08_01_fcpp.cpp begins with the definition of function PackedMathF32_Iavx512(). This function uses the C++ SIMD intrinsic function _mm512_load_ps() to initialize a_vals and b_vals. The next code block consists of C++ SIMD intrinsic function calls that perform various AVX-512 arithmetic operations using packed single-precision floating-point operands. This is followed by a series of _mm512_store_ps() calls that save the calculated results. Note that both _mm512_load_ps() and _mm512_store_ps() require their memory operands to be aligned on a 64-byte boundary.

Function PackedMathF64_Iavx512() is the double-precision floating-point counterpart of PackedMathF32_Iavx512(). Note that this function uses the double-precision (_pd) variants of the same C++ SIMD intrinsic functions. Here are the results for source code example Ch08_01:
Results for PackedMathF32
Group #0
  a:        36.000000        0.031250   |        2.000000       42.000000
  b:        -0.111111       64.000000   |       -0.062500        8.666667
  addps:    35.888889       64.031250   |        1.937500       50.666668
  subps:    36.111111      -63.968750   |        2.062500       33.333332
  mulps:    -4.000000        2.000000   |       -0.125000      364.000000
  divps:  -324.000031        0.000488   |      -32.000000        4.846154
  minps:    -0.111111        0.031250   |       -0.062500        8.666667
  maxps:    36.000000       64.000000   |        2.000000       42.000000
  sqrtps:   6.000000         0.176777   |        1.414214        6.480741
  absps:    0.111111        64.000000   |        0.062500        8.666667
Group #1
  a:           7.000000       20.500000   |       36.125000        0.500000
  b:         -18.125000       56.000000   |       24.000000     -158.600006
  addps:     -11.125000       76.500000   |       60.125000     -158.100006
  subps:      25.125000      -35.500000   |       12.125000      159.100006
  mulps:    -126.875000     1148.000000   |      867.000000      -79.300003
  divps:      -0.386207        0.366071   |        1.505208       -0.003153
  minps:     -18.125000       20.500000   |       24.000000     -158.600006
  maxps:       7.000000       56.000000   |       36.125000        0.500000
  sqrtps:      2.645751        4.527693   |        6.010407        0.707107
  absps:      18.125000       56.000000   |       24.000000      158.600006
Group #2
  a:         136.000000        2.031250   |       32.000000      442.000000
  b:          -9.111111      864.000000   |      -70.062500       98.666664
  addps:     126.888885      866.031250   |      -38.062500      540.666687
  subps:     145.111115     -861.968750   |      102.062500      343.333344
  mulps:   -1239.111084     1755.000000   |    -2242.000000    43610.664062
  divps:     -14.926830        0.002351   |       -0.456735        4.479730
  minps:      -9.111111        2.031250   |      -70.062500       98.666664
  maxps:     136.000000      864.000000   |       32.000000      442.000000
  sqrtps:     11.661903        1.425219   |        5.656854       21.023796
  absps:       9.111111      864.000000   |       70.062500       98.666664
Group #3
  a:          57.000000      620.500000   |      736.125000       80.500000
  b:        -518.125000      456.000000   |      324.000000     -298.600006
  addps:    -461.125000     1076.500000   |     1060.125000     -218.100006
  subps:     575.125000      164.500000   |      412.125000      379.100006
  mulps:  -29533.125000   282948.000000   |   238504.500000   -24037.300781
  divps:      -0.110012        1.360746   |        2.271991       -0.269591
  minps:    -518.125000      456.000000   |      324.000000     -298.600006
  maxps:      57.000000      620.500000   |      736.125000       80.500000
  sqrtps:      7.549834       24.909838   |       27.131624        8.972179
  absps:     518.125000      456.000000   |      324.000000      298.600006
Results for PackedMathF64
Group #0
  a:                     2.000000000000   |                  4.875000000000
  b:                     3.141592653590   |                  2.718281828459
  addpd:                 5.141592653590   |                  7.593281828459
  subpd:                -1.141592653590   |                  2.156718171541
  mulpd:                 6.283185307180   |                 13.251623913738
  divpd:                 0.636619772368   |                  1.793412275711
  minpd:                 2.000000000000   |                  2.718281828459
  maxpd:                 3.141592653590   |                  4.875000000000
  sqrtpd:                1.414213562373   |                  2.207940216582
  abspd:                 3.141592653590   |                  2.718281828459
Group #1
  a:                     7.500000000000   |                  3.000000000000
  b:                    -9.125000000000   |                 -3.141592653590
  addpd:                -1.625000000000   |                 -0.141592653590
  subpd:                16.625000000000   |                  6.141592653590
  mulpd:               -68.437500000000   |                 -9.424777960769
  divpd:                -0.821917808219   |                 -0.954929658551
  minpd:                -9.125000000000   |                 -3.141592653590
  maxpd:                 7.500000000000   |                  3.000000000000
  sqrtpd:                2.738612787526   |                  1.732050807569
  abspd:                 9.125000000000   |                  3.141592653590
Group #2
  a:                    12.333300000000   |                 24.000000000000
  b:                     1.570796326795   |                  1.359140914230
  addpd:                13.904096326795   |                 25.359140914230
  subpd:                10.762503673205   |                 22.640859085770
  mulpd:                19.373102337259   |                 32.619381941509
  divpd:                 7.851622638541   |                 17.658213176229
  minpd:                 1.570796326795   |                  1.359140914230
  maxpd:                12.333300000000   |                 24.000000000000
  sqrtpd:                3.511879838491   |                  4.898979485566
  abspd:                 1.570796326795   |                  1.359140914230
Group #3
  a:                    37.500000000000   |                 43.000000000000
  b:                    -4.562500000000   |                 -1.570796326795
  addpd:                32.937500000000   |                 41.429203673205
  subpd:                42.062500000000   |                 44.570796326795
  mulpd:              -171.093750000000   |                -67.544242052181
  divpd:                -8.219178082192   |                -27.374650211806
  minpd:                -4.562500000000   |                 -1.570796326795
  maxpd:                37.500000000000   |                 43.000000000000
  sqrtpd:                6.123724356958   |                  6.557438524302
  abspd:                 4.562500000000   |                  1.570796326795

Compare Operations

The next source code example is named Ch08_02. This example demonstrates packed floating-point compare operations using 512-bit wide SIMD operands. It also highlights mask merging operations using floating-point elements. Listing 8-2 shows the source code for example Ch08_02.
//------------------------------------------------
//               Ch08_02.h
//------------------------------------------------
#pragma once
#include <cstdint>
#include "ZmmVal.h"
// Ch08_02_fcpp.cpp
extern void PackedCompareF32_Iavx512(uint16_t c[8], const ZmmVal* a, const ZmmVal* b);
extern void PackedCompareF64_Iavx512(ZmmVal* c, const ZmmVal* a, const ZmmVal* b,
    double x1, double x2, double x3);
//------------------------------------------------
//               Ch08_02.cpp
//------------------------------------------------
#include <iostream>
#include <iomanip>
#include <limits>
#define _USE_MATH_DEFINES
#include <math.h>
#include "Ch08_02.h"
static void PackedCompareF32(void);
static void PackedCompareF64(void);
static const char* c_CmpStr[8] = { "EQ", "NE", "LT", "LE", "GT", "GE", "OD", "UO" };
int main()
{
    // Run the F32 and F64 packed-compare demonstrations.
    PackedCompareF32();
    PackedCompareF64();
    return 0;
}
static void PackedCompareF32(void)
{
    ZmmVal a, b;
    uint16_t c[8];
    const char nl = ' ';
    constexpr float qnan_f32 = std::numeric_limits<float>::quiet_NaN();
    a.m_F32[0] = 2.0f;                  b.m_F32[0] = 1.0f;
    a.m_F32[1] = 7.0f;                  b.m_F32[1] = 12.0f;
    a.m_F32[2] = -6.0f;                 b.m_F32[2] = -6.0f;
    a.m_F32[3] = 3.0f;                  b.m_F32[3] = 8.0f;
    a.m_F32[4] = -16.0f;                b.m_F32[4] = -36.0f;
    a.m_F32[5] = 3.5f;                  b.m_F32[5] = 3.5f;
    a.m_F32[6] = (float)M_PI;           b.m_F32[6] = -6.0f;
    a.m_F32[7] = (float)M_SQRT2;        b.m_F32[7] = qnan_f32;
    a.m_F32[8] = 102.0f;                b.m_F32[8] = (float)M_SQRT1_2;
    a.m_F32[9] = 77.0f;                 b.m_F32[9] = 77.0f;
    a.m_F32[10] = 187.0f;               b.m_F32[10] = 33.0f;
    a.m_F32[11] = -5.1f;                b.m_F32[11] = -87.0f;
    a.m_F32[12] = 16.0f;                b.m_F32[12] = 936.0f;
    a.m_F32[13] = 0.5f;                 b.m_F32[13] = 0.5f;
    a.m_F32[14] = (float)(M_PI * 2);    b.m_F32[14] = 66.6667f;
    a.m_F32[15] = (float)(1.0/M_SQRT2); b.m_F32[15] = 100.7f;
    PackedCompareF32_Iavx512(c, &a, &b);
    const size_t w1 = 10;
    const size_t w2 = 6;
    std::cout << (" Results for PackedCompareF32 ");
    std::cout << std::fixed << std::setprecision(4);
    std::cout << "      a          b    ";
    for (unsigned int j = 0; j < 8; j++)
        std::cout << std::setw(w2) << c_CmpStr[j];
    std::cout << nl << std::string(70, '-') << nl;
    for (unsigned int i = 0; i < 16; i++)
    {
        std::cout << std::setw(w1) << a.m_F32[i];
        std::cout << std::setw(w1) << b.m_F32[i];
        for (unsigned int j = 0; j < 8; j++)
            std::cout << std::setw(w2) << ((c[j] & (1 << i)) ? 1 : 0);
        std::cout << nl;
    }
}
static void PackedCompareF64(void)
{
    ZmmVal a, b, c;
    const double x1 = -20.0, x2 = 20.0, x3 = 999.0;
    const char nl = ' ';
    a.m_F64[0] = 5.0;           b.m_F64[0] = -1.0;
    a.m_F64[1] = 21.0;          b.m_F64[1] =  2.0;
    a.m_F64[2] = 3.0;           b.m_F64[2] = 14.0;
    a.m_F64[3] = 4.0;           b.m_F64[3] = -9.0;
    a.m_F64[4] = 52.0;          b.m_F64[4] = -5.0;
    a.m_F64[5] = 6.0;           b.m_F64[5] = 6.0;
    a.m_F64[6] = 7.0;           b.m_F64[6] = -37.0;
    a.m_F64[7] = 8.0;           b.m_F64[7] = -9.0;
    PackedCompareF64_Iavx512(&c, &a, &b, x1, x2, x3);
    const size_t w1 = 8;
    const size_t w2 = 12;
    std::cout << std::fixed << std::setprecision(4);
    std::cout << (" Results for PackedCompareF64 ");
    std::cout << "(x1 = " << std::setw(w1) << x1;
    std::cout << " x2 = " << std::setw(w1) << x2 << ") ";
    for (unsigned int i = 0; i < 8; i++)
    {
        std::cout << std::setw(w2) << a.m_F64[i];
        std::cout << std::setw(w2) << b.m_F64[i];
        std::cout << std::setw(w2) << c.m_F64[i];
        std::cout << nl;
    }
}
//------------------------------------------------
//               Ch08_02_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_02.h"
// Compares the 16 F32 element pairs of a and b using eight different compare
// predicates. Each result c[j] is a 16-bit mask: bit i is set when the
// predicate holds for element pair i. Operands must be 64-byte aligned.
// (The predicate argument is a compile-time immediate, so each compare is
// written out explicitly rather than looped.)
void PackedCompareF32_Iavx512(uint16_t c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512 av = _mm512_load_ps(a);
    __m512 bv = _mm512_load_ps(b);
    c[0] = _mm512_cmp_ps_mask(av, bv, _CMP_EQ_OQ);      // equal
    c[1] = _mm512_cmp_ps_mask(av, bv, _CMP_NEQ_OQ);     // not equal
    c[2] = _mm512_cmp_ps_mask(av, bv, _CMP_LT_OQ);      // less than
    c[3] = _mm512_cmp_ps_mask(av, bv, _CMP_LE_OQ);      // less than or equal
    c[4] = _mm512_cmp_ps_mask(av, bv, _CMP_GT_OQ);      // greater than
    c[5] = _mm512_cmp_ps_mask(av, bv, _CMP_GE_OQ);      // greater than or equal
    c[6] = _mm512_cmp_ps_mask(av, bv, _CMP_ORD_Q);      // ordered (neither is NaN)
    c[7] = _mm512_cmp_ps_mask(av, bv, _CMP_UNORD_Q);    // unordered (either is NaN)
}
// Demonstrates merge masking: c[i] = a[i] * b[i] when BOTH a[i] and b[i] lie
// in the closed interval [x1, x2]; otherwise c[i] = x3. The ZmmVal operands
// must be 64-byte aligned.
void PackedCompareF64_Iavx512(ZmmVal* c, const ZmmVal* a, const ZmmVal* b,
    double x1, double x2, double x3)
{
    __m512d av = _mm512_load_pd(a);
    __m512d bv = _mm512_load_pd(b);
    __m512d lo_vals = _mm512_set1_pd(x1);
    __m512d hi_vals = _mm512_set1_pd(x2);
    __m512d fill_vals = _mm512_set1_pd(x3);
    // Build per-operand range masks, then require both to hold.
    __mmask8 a_in_range = _mm512_cmp_pd_mask(av, lo_vals, _CMP_GE_OQ)
                        & _mm512_cmp_pd_mask(av, hi_vals, _CMP_LE_OQ);
    __mmask8 b_in_range = _mm512_cmp_pd_mask(bv, lo_vals, _CMP_GE_OQ)
                        & _mm512_cmp_pd_mask(bv, hi_vals, _CMP_LE_OQ);
    __mmask8 m = a_in_range & b_in_range;
    // Merge masking: multiply where the mask bit is set, take x3 elsewhere.
    _mm512_store_pd(c, _mm512_mask_mul_pd(fill_vals, m, av, bv));
}
Listing 8-2

Example Ch08_02

In Listing 8-2, the files Ch08_02.h and Ch08_02.cpp contain function declarations and test case initialization code for this source code example. The first function in file Ch08_02_fcpp.cpp, PackedCompareF32_Iavx512(), performs SIMD compare operations using packed single-precision floating-point operands. Unlike AVX and AVX2, AVX-512 SIMD floating-point compare operations return scalar integers that signify the results. In the current example, the C++ SIMD intrinsic function _mm512_cmp_ps_mask() returns an integer value of type __mmask16. Each bit position of this 16-bit wide mask value reports the compare result for the corresponding SIMD operand element position (1 = compare predicate true, 0 = compare predicate false). Function _mm512_cmp_ps_mask() uses the same compare predicates that _mm256_cmp_ps() uses (see example Ch03_02).

The next function in file Ch08_02_fcpp.cpp is named PackedCompareF64_Iavx512(). This function demonstrates execution of the C++ SIMD intrinsic function _mm512_cmp_pd_mask(). Note that the mask value calculated by the four _mm512_cmp_pd_mask() calls is used in the ensuing call to _mm512_mask_mul_pd(), which carries out multiplication using only those element pairs whose values are greater than or equal to x1 and less than or equal to x2. Element pairs whose values do not meet these criteria are set to x3. This (somewhat contrived) computation illustrates how to perform merge masking using packed double-precision floating-point operands. Most AVX-512 floating-point C++ SIMD intrinsic functions support both merge masking and zero masking variants. Here are the results for source code example Ch08_02:
Results for PackedCompareF32
      a          b       EQ    NE    LT    LE    GT    GE    OD    UO
----------------------------------------------------------------------
    2.0000    1.0000     0     1     0     0     1     1     1     0
    7.0000   12.0000     0     1     1     1     0     0     1     0
   -6.0000   -6.0000     1     0     0     1     0     1     1     0
    3.0000    8.0000     0     1     1     1     0     0     1     0
  -16.0000  -36.0000     0     1     0     0     1     1     1     0
    3.5000    3.5000     1     0     0     1     0     1     1     0
    3.1416   -6.0000     0     1     0     0     1     1     1     0
    1.4142       nan     0     0     0     0     0     0     0     1
  102.0000    0.7071     0     1     0     0     1     1     1     0
   77.0000   77.0000     1     0     0     1     0     1     1     0
  187.0000   33.0000     0     1     0     0     1     1     1     0
   -5.1000  -87.0000     0     1     0     0     1     1     1     0
   16.0000  936.0000     0     1     1     1     0     0     1     0
    0.5000    0.5000     1     0     0     1     0     1     1     0
    6.2832   66.6667     0     1     1     1     0     0     1     0
    0.7071  100.7000     0     1     1     1     0     0     1     0
Results for PackedCompareF64 (x1 = -20.0000 x2 =  20.0000)
      5.0000     -1.0000     -5.0000
     21.0000      2.0000    999.0000
      3.0000     14.0000     42.0000
      4.0000     -9.0000    -36.0000
     52.0000     -5.0000    999.0000
      6.0000      6.0000     36.0000
      7.0000    -37.0000    999.0000
      8.0000     -9.0000    -72.0000

Floating-Point Arrays

In Chapter 3, you learned how to calculate the mean and standard deviation of an array of single-precision floating-point values using AVX (see example Ch03_04). The next source example, Ch08_03, explains how to perform these same calculations using AVX-512. Listing 8-3 shows only the mean and standard deviation calculating code for example Ch08_03. Not shown is the test case initialization and error checking code since it is almost identical to the code that you saw in example Ch03_04.
//------------------------------------------------
//               Ch08_03_fcpp.cpp
//------------------------------------------------
#include <cmath>
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_03.h"
#include "AlignedMem.h"
// Scalar reference implementation: calculates the arithmetic mean of the
// n-element array x and stores it in *mean. Throws std::runtime_error when
// argument validation (CheckArgs) fails.
void CalcMeanF32_Cpp(float* mean, const float* x, size_t n)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcMeanF32_Cpp() - CheckArgs failed");
    float total = 0.0f;
    for (size_t j = 0; j < n; j++)
        total += x[j];
    *mean = total / n;
}
// Scalar reference implementation: calculates the sample standard deviation
// (divisor n - 1) of the n-element array x about the supplied mean and stores
// it in *st_dev. Throws std::runtime_error when CheckArgs fails.
void CalcStDevF32_Cpp(float* st_dev, const float* x, size_t n, float mean)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcStDevF32_Cpp() - CheckArgs failed");
    float ss = 0.0f;
    for (size_t j = 0; j < n; j++)
    {
        float dev = x[j] - mean;
        ss += dev * dev;
    }
    *st_dev = sqrt(ss / (n - 1));
}
// AVX-512 mean: accumulates 16 partial F32 sums per iteration, reduces them
// to a scalar, and finishes any residual elements with scalar code.
// x must be aligned on a 64-byte boundary (enforced by CheckArgs).
void CalcMeanF32_Iavx512(float* mean, const float* x, size_t n)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcMeanF32_Iavx512() - CheckArgs failed");
    constexpr size_t kSimdWidth = 16;
    __m512 partial_sums = _mm512_setzero_ps();
    size_t i = 0;
    while (n - i >= kSimdWidth)
    {
        partial_sums = _mm512_add_ps(_mm512_load_ps(&x[i]), partial_sums);
        i += kSimdWidth;
    }
    // Horizontal reduction of the 16 partial sums to a single scalar.
    float total = _mm512_reduce_add_ps(partial_sums);
    // Residual elements (n not a multiple of 16) handled with scalar code.
    while (i < n)
        total += x[i++];
    *mean = total / n;
}
void CalcStDevF32_Iavx512(float* st_dev, const float* x, size_t n, float mean)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcStDevF32_Iavx512() - CheckArgs failed");
    __m512 packed_mean = _mm512_set1_ps(mean);
    __m512 packed_sum_squares = _mm512_setzero_ps();
    size_t i = 0;
    const size_t num_simd_elements = 16;
    for (; n - i >= num_simd_elements; i += num_simd_elements)
    {
        __m512 x_vals = _mm512_load_ps(&x[i]);
        __m512 temp1 = _mm512_sub_ps(x_vals, packed_mean);
        packed_sum_squares = _mm512_fmadd_ps(temp1, temp1, packed_sum_squares);
    }
    // Reduce packed_sum_squares to scalar value, then process remaining elements
    float sum_squares =_mm512_reduce_add_ps(packed_sum_squares);
    for (; i < n; i++)
    {
        float temp1 = x[i] - mean;
        sum_squares += temp1 * temp1;
    }
    *st_dev = sqrt(sum_squares / (n - 1));
}
Listing 8-3

Example Ch08_03

The first two functions in file Ch08_03_fcpp.cpp, CalcMeanF32_Cpp() and CalcStDevF32_Cpp(), calculate the mean and standard deviation using standard C++ statements. These functions are included in this example for comparison purposes. The next function, CalcMeanF32_Iavx512(), calculates the array mean using AVX-512 C++ SIMD intrinsic functions. Following argument validation, function CalcMeanF32_Iavx512() uses _mm512_setzero_ps() to initialize sums to zero. The variable sums contains 16 intermediate single-precision floating-point sum values. These values are updated during each iteration of the ensuing for-loop. Following execution of the for-loop, CalcMeanF32_Iavx512() uses the C++ SIMD intrinsic function _mm512_reduce_add_ps() to reduce the 16 single-precision floating-point values in sums to a single scalar value. Recall that the AVX code in example Ch03_04 employed a sequence of C++ SIMD intrinsic function calls to perform this same reduction. Following the reduction of sums, the second for-loop in CalcMeanF32_Iavx512() processes any residual elements using scalar arithmetic.

The standard deviation calculating code in function CalcStDevF32_Iavx512() uses the C++ SIMD intrinsic function _mm512_set1_ps() to initialize packed_mean. This is followed by a call to _mm512_setzero_ps(), which initializes packed_sum_squares to zero. During each iteration, the first for-loop in CalcStDevF32_Iavx512() updates this value using 16 elements from array x. Following execution of the for-loop, packed_sum_squares is reduced to a scalar value using the C++ SIMD intrinsic function _mm512_reduce_add_ps(). The second for-loop in CalcStDevF32_Iavx512() processes any residual elements using scalar arithmetic. Here are the results for source code example Ch08_03:
Results for CalcMeanF32_Cpp and CalcStDevF32_Cpp
mean1:    49.602146  st_dev1:  27.758242
Results for CalcMeanF32_Iavx512 and CalcStDevF32_Iavx512
mean2:    49.602158  st_dev2:  27.758244

Like example Ch03_04, the results for source code example Ch08_03 contain some slight discrepancy values due to the non-associativity of floating-point arithmetic. Whether these discrepancies are of any consequence depends on the specific application.

Floating-Point Matrices

In Chapter 5, you studied several source code examples that explained how to perform common matrix operations using AVX2 C++ SIMD intrinsic functions. In this section, you will learn how to carry out some of the same matrix operations using AVX-512 C++ SIMD intrinsic functions. The first source code example highlights the use of AVX-512 to calculate a covariance matrix. This is followed by two source code examples that spotlight matrix multiplication. The final source code example of this section explicates matrix-vector multiplication. As you will soon see, it is often a straightforward programming task to adapt an algorithm originally written using AVX2 C++ SIMD intrinsic functions to one that exploits the computational resources of AVX-512.

Covariance Matrix

Mathematicians often use a statistical measure called covariance to quantify the extent to which two random variables vary together. When multiple random variables are being analyzed, it is common to calculate a matrix of all possible covariances. This matrix is called, unsurprisingly, a covariance matrix. Once calculated, a covariance matrix can be employed to perform a wide variety of advanced statistical analyses. Appendix B contains several references that you can consult if you are interested in learning more about covariance and covariance matrices.

The calculation of a covariance matrix begins with a sample data matrix as shown in Figure 8-1. In this figure, each row of matrix X represents one random variable (or feature). Each column in X is a multivariate observation. The elements cij in covariance matrix C are calculated using the following equation:
$$ c_{ij} = \frac{\sum\limits_{k=0}^{n_{obv}-1} \left( x_{ik} - \overline{x}_i \right) \left( x_{jk} - \overline{x}_j \right)}{n_{obv}-1} $$
where i = 0, 1, ⋯, nvar − 1 and j = 0, 1, ⋯, nvar − 1. In these equations, the symbols nobv and nvar signify the number of observations and variables, respectively. A covariance matrix is always a square (nvar × nvar) symmetric (cij = cji) matrix as shown in Figure 8-1. Each covariance matrix element cij represents the covariance between random variables xi and xj, and each main diagonal element cii is the variance for variable xi.
Figure 8-1

Example data matrix and covariance matrix

Listing 8-4 shows the source code for example Ch08_04. This example demonstrates how to calculate a covariance matrix using AVX-512 SIMD arithmetic.
//------------------------------------------------
//               Ch08_04.h
//------------------------------------------------
#pragma once
#include <vector>
#include "MatrixF64.h"
// Note: In this example, CMD stands for covariance matrix data
struct CMD
{
    MatrixF64 m_X;                      // Data matrix: n_vars rows (variables) x n_obvs columns (observations)
    MatrixF64 m_CovMat;                 // Covariance matrix: n_vars x n_vars, symmetric
    std::vector<double> m_VarMeans;     // Mean of each variable, i.e., of each row of m_X
    // Allocates the data matrix, covariance matrix, and means vector for
    // n_vars variables and n_obvs observations per variable.
    CMD(size_t n_vars, size_t n_obvs) :
        m_X(n_vars, n_obvs), m_CovMat(n_vars, n_vars), m_VarMeans(n_vars) { }
};
// Ch08_04_fcpp.cpp
extern void CalcCovMatF64_Cpp(CMD& cmd);
extern void CalcCovMatF64_Iavx512(CMD& cmd);
// Ch08_04_misc.cpp
extern bool CheckArgs(const CMD& cmd);
extern bool CompareResults(CMD& cmd1, CMD& cmd2);
extern void InitCMD(CMD& cmd1, CMD& cmd2);
// Ch08_04_misc2.cpp
extern void DisplayData(const CMD& cmd);
//------------------------------------------------
//               Ch08_04_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_04.h"
void CalcCovMatF64_Cpp(CMD& cmd)
{
    if (!CheckArgs(cmd))
        throw std::runtime_error("CalcCovMatrixF64_Cpp() - CheckArgs failed");
    size_t n_vars = cmd.m_X.GetNumRows();
    size_t n_obvs = cmd.m_X.GetNumCols();
    double* cov_mat = cmd.m_CovMat.Data();
    double* x = cmd.m_X.Data();
    double* var_means = cmd.m_VarMeans.data();
    // Calculate variable means (rows of cmd.m_X)
    for (size_t i = 0; i < n_vars; i++)
    {
        var_means[i] = 0.0;
        for (size_t j = 0; j < n_obvs; j++)
            var_means[i] += x[i * n_obvs + j];
        var_means[i] /= n_obvs;
    }
    // Calculate covariance matrix
    for (size_t i = 0; i < n_vars; i++)
    {
        for (size_t j = 0; j < n_vars; j++)
        {
            if (i <= j)
            {
                double sum = 0.0;
                for (size_t k = 0; k < n_obvs; k++)
                {
                    double temp1 = x[i * n_obvs + k] - var_means[i];
                    double temp2 = x[j * n_obvs + k] - var_means[j];
                    sum += temp1 * temp2;
                }
                cov_mat[i * n_vars + j] = sum / (n_obvs - 1);
            }
            else
                cov_mat[i * n_vars + j] = cov_mat[j * n_vars + i];
        }
    }
}
// Horizontally sums the double-precision elements of a (8 elements),
// b (4 elements), and c (2 elements) and returns the scalar total.
// Arguments c and b are zero-extended to wider registers (upper lanes
// cleared, identical to inserting into a zeroed vector) so that a single
// _mm512_reduce_add_pd() can perform the final reduction.
inline double ReduceAddF64(__m512d a, __m256d b, __m128d c)
{
    // Widen c to 256 bits (upper two lanes zero) and fold in b
    __m256d partial256 = _mm256_add_pd(_mm256_zextpd128_pd256(c), b);

    // Widen the 256-bit partial sums to 512 bits and fold in a
    __m512d partial512 = _mm512_add_pd(_mm512_zextpd256_pd512(partial256), a);

    // Reduce the eight partial sums to a scalar
    return _mm512_reduce_add_pd(partial512);
}
void CalcCovMatF64_Iavx512(CMD& cmd)
{
    if (!CheckArgs(cmd))
        throw std::runtime_error("CalcCovMatrixF64_Iavx512() - CheckArgs failed");
    size_t n_vars = cmd.m_X.GetNumRows();
    size_t n_obvs = cmd.m_X.GetNumCols();
    double* cov_mat = cmd.m_CovMat.Data();
    double* x = cmd.m_X.Data();
    double* var_means = cmd.m_VarMeans.data();
    const size_t num_simd_elements8 = 8;
    const size_t num_simd_elements4 = 4;
    const size_t num_simd_elements2 = 2;
    // Calculate variable means (rows of cmd.m_X)
    for (size_t i = 0; i < n_vars; i++)
    {
        size_t j = 0;
         __m512d sums_512 = _mm512_setzero_pd();
         __m256d sums_256 = _mm256_setzero_pd();
         __m128d sums_128 = _mm_setzero_pd();
        while (j < n_obvs)
        {
            if (n_obvs - j >= num_simd_elements8)
            {
                __m512d x_vals = _mm512_loadu_pd(&x[i * n_obvs + j]);
                sums_512 = _mm512_add_pd(x_vals, sums_512);
                j += num_simd_elements8;
            }
            else if (n_obvs - j >= num_simd_elements4)
            {
                __m256d x_vals = _mm256_loadu_pd(&x[i * n_obvs + j]);
                sums_256 = _mm256_add_pd(x_vals, sums_256);
                j += num_simd_elements4;
            }
            else if (n_obvs - j >= num_simd_elements2)
            {
                __m128d x_vals = _mm_loadu_pd(&x[i * n_obvs + j]);
                sums_128 = _mm_add_pd(x_vals, sums_128);
                j += num_simd_elements2;
            }
            else
            {
                __m128d x_val = _mm_load_sd(&x[i * n_obvs + j]);
                sums_128 = _mm_add_pd(x_val, sums_128);
                j += 1;
            }
        }
        double sum = ReduceAddF64(sums_512, sums_256, sums_128);
        var_means[i] = sum / n_obvs;
    }
    // Calculate covariance matrix
    for (size_t i = 0; i < n_vars; i++)
    {
        for (size_t j = 0; j < n_vars; j++)
        {
            if (i <= j)
            {
                size_t k = 0;
                __m512d sums_512 = _mm512_setzero_pd();
                __m256d sums_256 = _mm256_setzero_pd();
                __m128d sums_128 = _mm_setzero_pd();
                while (k < n_obvs)
                {
                    if (n_obvs - k >= num_simd_elements8)
                    {
                        __m512d var_means1_512 = _mm512_set1_pd(var_means[i]);
                        __m512d var_means2_512 = _mm512_set1_pd(var_means[j]);
                        __m512d x_vals1 = _mm512_loadu_pd(&x[i * n_obvs + k]);
                        __m512d x_vals2 = _mm512_loadu_pd(&x[j * n_obvs + k]);
                        __m512d temp1 = _mm512_sub_pd(x_vals1, var_means1_512);
                        __m512d temp2 = _mm512_sub_pd(x_vals2, var_means2_512);
                        sums_512 = _mm512_fmadd_pd(temp1, temp2, sums_512);
                        k += num_simd_elements8;
                    }
                    else if (n_obvs - k >= num_simd_elements4)
                    {
                        __m256d var_means1_256 = _mm256_set1_pd(var_means[i]);
                        __m256d var_means2_256 = _mm256_set1_pd(var_means[j]);
                        __m256d x_vals1 = _mm256_loadu_pd(&x[i * n_obvs + k]);
                        __m256d x_vals2 = _mm256_loadu_pd(&x[j * n_obvs + k]);
                        __m256d temp1 = _mm256_sub_pd(x_vals1, var_means1_256);
                        __m256d temp2 = _mm256_sub_pd(x_vals2, var_means2_256);
                        sums_256 = _mm256_fmadd_pd(temp1, temp2, sums_256);
                        k += num_simd_elements4;
                    }
                    else if (n_obvs - k >= num_simd_elements2)
                    {
                        __m128d var_means1_128 = _mm_set1_pd(var_means[i]);
                        __m128d var_means2_128 = _mm_set1_pd(var_means[j]);
                        __m128d x_vals1 = _mm_loadu_pd(&x[i * n_obvs + k]);
                        __m128d x_vals2 = _mm_loadu_pd(&x[j * n_obvs + k]);
                        __m128d temp1 = _mm_sub_pd(x_vals1, var_means1_128);
                        __m128d temp2 = _mm_sub_pd(x_vals2, var_means2_128);
                        sums_128 = _mm_fmadd_pd(temp1, temp2, sums_128);
                        k += num_simd_elements2;
                    }
                    else
                    {
                        __m128d var_means1_64 = _mm_load_sd(&var_means[i]);
                        __m128d var_means2_64 = _mm_load_sd(&var_means[j]);
                        __m128d x_vals1 = _mm_load_sd(&x[i * n_obvs + k]);
                        __m128d x_vals2 = _mm_load_sd(&x[j * n_obvs + k]);
                        __m128d temp1 = _mm_sub_pd(x_vals1, var_means1_64);
                        __m128d temp2 = _mm_sub_pd(x_vals2, var_means2_64);
                        sums_128 = _mm_fmadd_pd(temp1, temp2, sums_128);
                        k += 1;
                    }
                }
                double sum = ReduceAddF64(sums_512, sums_256, sums_128);
                cov_mat[i * n_vars + j] = sum / (n_obvs - 1);
            }
            else
                cov_mat[i * n_vars + j] = cov_mat[j * n_vars + i];
        }
    }
}
Listing 8-4

Example Ch08_04

Near the top of Listing 8-4 is the file Ch08_04.h, which begins with the definition of structure CMD (CMD = covariance matrix data). This structure contains the data matrix, the variable means vector, and the covariance matrix. Note that CMD also includes a simple constructor that allocates space for the three container objects using the specified n_vars and n_obvs. The source code that performs argument validation, test data initialization, and result comparisons is not shown in Listing 8-4 but included in the download software package.

The core calculating functions of this source code example are in Ch08_04_fcpp.cpp, which begins with the definition of function CalcCovMatF64_Cpp(). This function uses standard C++ statements to calculate the covariance matrix and is included for comparison purposes. The code in CalcCovMatF64_Cpp() is split into two major sections. The first section calculates the mean for each variable (or row) in data matrix x. The second section calculates the covariances. Note that function CalcCovMatF64_Cpp() exploits the fact that a covariance matrix is symmetric and only carries out a complete calculation when i <= j is true. If i <= j is false, CalcCovMatF64_Cpp() executes cov_mat[i][j] = cov_mat[j][i].

The next function in Ch08_04_fcpp.cpp is a SIMD inline function named ReduceAddF64(). This function reduces the double-precision floating-point elements of arguments a (__m512d), b (__m256d), and c (__m128d) to a scalar double-precision value. Note that ReduceAddF64() employs several C++ SIMD intrinsic functions to size-extend argument values b and c to packed 512-bit wide SIMD values. Doing this facilitates the use of the AVX-512 C++ SIMD intrinsic function _mm512_reduce_add_pd() to perform the reduction.

The final function in Listing 8-4 is named CalcCovMatF64_Iavx512(). Like its standard C++ counterpart, function CalcCovMatF64_Iavx512() uses distinct sections of code to calculate the variable means and the covariance matrix. The mean calculating while-loop employs __m512d, __m256d, __m128d, or scalar objects to perform its computations. Note that each if section verifies that enough elements are available in the current row before carrying out any SIMD calculations. Following the while-loop, CalcCovMatF64_Iavx512() invokes ReduceAddF64() to reduce sums_512, sums_256, and sums_128 to a scalar value. It then calculates var_means[i].

Function CalcCovMatF64_Iavx512() uses a similar while-loop construct to calculate the elements of the covariance matrix. Like the function CalcCovMatF64_Cpp(), function CalcCovMatF64_Iavx512() calculates a covariance matrix element only if i <= j is true; otherwise, a covariance matrix element copy is performed. Note that the four covariance-matrix-element calculating code blocks in CalcCovMatF64_Iavx512() employ FMA arithmetic to carry out their computations. Here are the results for source code example Ch08_04:
Results for CalcCovMatF64
n_vars = 12, n_obvs = 111
Variable means
   0:     13.37     13.37
   1:     12.34     12.34
   2:     12.62     12.62
   3:     13.25     13.25
   4:     11.89     11.89
   5:     12.10     12.10
   6:     11.65     11.65
   7:     12.13     12.13
   8:     12.49     12.49
   9:     12.53     12.53
  10:     12.08     12.08
  11:     12.33     12.33
cmd1.m_CovMat
 50.61   2.00   0.60   1.41 -1.57   3.15   -2.68   4.36  -0.70   3.04   5.16  -4.16
  2.00  48.45  -6.96   2.20  5.23   1.45    0.62  -4.88  -0.52   9.59   1.86  -1.76
  0.60  -6.96  48.26 -13.01  1.15  -2.10    5.47  -5.81   3.22   0.46  -0.75   7.50
  1.41   2.20 -13.01  51.71  2.05   1.76   -5.01   7.45  -2.03   4.34  -5.41   7.26
 -1.57   5.23   1.15   2.05 55.88  -6.90   -7.11   0.17   4.78   4.57  -0.03  -3.01
  3.15   1.45  -2.10   1.76 -6.90  54.28   10.10   4.07   0.17  -0.99   1.24   3.15
 -2.68   0.62   5.47  -5.01 -7.11  10.10   56.98  -4.95  -2.19  -2.47 -10.59   1.63
  4.36  -4.88  -5.81   7.45  0.17   4.07   -4.95  52.51   3.89   1.76   4.04   1.80
 -0.70  -0.52   3.22  -2.03  4.78   0.17   -2.19   3.89  52.56   3.16  -2.82   5.82
  3.04   9.59   0.46   4.34  4.57  -0.99   -2.47   1.76   3.16  42.41   8.35   1.28
  5.16   1.86  -0.75  -5.41 -0.03   1.24  -10.59   4.04  -2.82   8.35  58.25 -11.63
 -4.16  -1.76   7.50   7.26 -3.01   3.15    1.63   1.80   5.82   1.28 -11.63  49.79
cmd2.m_CovMat
 50.61   2.00   0.60   1.41  -1.57   3.15   -2.68   4.36  -0.70   3.04   5.16  -4.16
  2.00  48.45  -6.96   2.20   5.23   1.45    0.62  -4.88  -0.52   9.59   1.86  -1.76
  0.60  -6.96  48.26 -13.01   1.15  -2.10    5.47  -5.81   3.22   0.46  -0.75   7.50
  1.41   2.20 -13.01  51.71   2.05   1.76   -5.01   7.45  -2.03   4.34  -5.41   7.26
 -1.57   5.23   1.15   2.05  55.88  -6.90   -7.11   0.17   4.78   4.57  -0.03  -3.01
  3.15   1.45  -2.10   1.76  -6.90  54.28   10.10   4.07   0.17  -0.99   1.24   3.15
 -2.68   0.62   5.47  -5.01  -7.11  10.10   56.98  -4.95  -2.19  -2.47 -10.59   1.63
  4.36  -4.88  -5.81   7.45   0.17   4.07   -4.95  52.51   3.89   1.76   4.04   1.80
 -0.70  -0.52   3.22  -2.03   4.78   0.17   -2.19   3.89  52.56   3.16  -2.82   5.82
  3.04   9.59   0.46   4.34   4.57  -0.99   -2.47   1.76   3.16  42.41   8.35   1.28
  5.16   1.86  -0.75  -5.41  -0.03   1.24  -10.59   4.04  -2.82   8.35  58.25 -11.63
 -4.16  -1.76   7.50   7.26  -3.01   3.15    1.63   1.80   5.82   1.28 -11.63  49.79
CompareResults - passed

Matrix Multiplication

In Chapter 5, you learned how to perform single-precision floating-point matrix multiplication using C++ SIMD intrinsic functions and AVX2 (see example Ch05_02). The next source code example, named Ch08_05, illustrates single-precision floating-point matrix multiplication using C++ SIMD intrinsic functions and AVX-512. Listing 8-5 shows the matrix multiplication calculating code for source code example Ch08_05. Before examining this source code, you may want to review the matrix multiplication equations that were discussed in Chapter 5.
//------------------------------------------------
//               Ch08_05_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_05.h"
// Reference (non-SIMD) single-precision matrix multiply: c = a * b.
// Delegates to the MatrixF32 class's standard C++ implementation; included
// for result comparison and benchmarking against the AVX-512 version.
void MatrixMulF32_Cpp(MatrixF32& c, const MatrixF32& a, const MatrixF32& b)
{
    MatrixF32::Mul(c, a, b);
}
void MatrixMulF32_Iavx512(MatrixF32& c, const MatrixF32& a, const MatrixF32& b)
{
    if (!CheckArgs(c, a, b))
        throw std::runtime_error("MatrixMulF32_Iavx512() CheckArgs failed");
    const float* aa = a.Data();
    const float* bb = b.Data();
    float* cc = c.Data();
    size_t c_nrows = c.GetNumRows();
    size_t c_ncols = c.GetNumCols();
    size_t a_ncols = a.GetNumCols();
    size_t b_ncols = b.GetNumCols();
    size_t m = a_ncols;
    const size_t num_simd_elements = 16;
    size_t num_residual_cols = c_ncols % num_simd_elements;
    // res_mask = 2 ** num_residual_cols - 1
    __mmask16 res_mask = (__mmask16)((1 << num_residual_cols) - 1);
    for (size_t i = 0; i < c_nrows; i++)
    {
        size_t j = 0;
        while (j + num_simd_elements <= c_ncols)
        {
            __m512 c_vals = _mm512_setzero_ps();
            for (size_t k = 0; k < m; k++)
            {
                __m512 a_vals = _mm512_set1_ps(aa[i * a_ncols + k]);
                __m512 b_vals = _mm512_loadu_ps(&bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_ps(a_vals, b_vals, c_vals);
            }
            _mm512_storeu_ps(&cc[i * c_ncols + j], c_vals);
            j += num_simd_elements;
        }
        if (num_residual_cols != 0)
        {
            __m512 c_vals = _mm512_setzero_ps();
            for (size_t k = 0; k < m; k++)
            {
                __m512 a_vals = _mm512_set1_ps(aa[i * a_ncols + k]);
                __m512 b_vals = _mm512_maskz_loadu_ps(res_mask, &bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_ps(a_vals, b_vals, c_vals);
            }
            _mm512_mask_storeu_ps(&cc[i * c_ncols + j], res_mask, c_vals);
        }
    }
}
Listing 8-5

Example Ch08_05

Near the top of Listing 8-5 is the source code for function MatrixMulF32_Iavx512(), which performs single-precision floating-point matrix multiplication. The primary difference between this function and the function MatrixMulF32_Iavx2() that you studied in example Ch05_02 is in the code that calculates the residual column mask for the current row. In example Ch05_02, function MatrixMulF32_Iavx2() used a SIMD integer (__m256i) mask. In this example, function MatrixMulF32_Iavx512() uses a scalar integer (__mmask16) mask since these are directly supported by AVX-512.

Following its initialization tasks, function MatrixMulF32_Iavx512() calculates num_residual_cols. This value is then used to compute the mask needed to process any residual columns in the current row. The layout of MatrixMulF32_Iavx512() is akin to the Ch05_02’s layout of MatrixMulF32_Iavx2(). For nonresidual columns, function MatrixMulF32_Iavx512() uses the C++ SIMD intrinsic functions _mm512_set1_ps(), _mm512_loadu_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps() to calculate products c[i][j:j+15]. Calculation of residual column products is similar except that this code block uses the C++ SIMD intrinsic functions _mm512_maskz_loadu_ps() and _mm512_mask_storeu_ps() to perform masked load and store operations. Here are the results for source code example Ch08_05:
Results for MatrixMulF32
Matrix compare passed
Results saved to file Ch08_05_MatrixMulF32_LITHIUM.txt
Running benchmark function MatrixMulF32_bm - please wait
Benchmark times save to file Ch08_05_MatrixMulF32_bm_LITHIUM.csv
Table 8-1 shows some benchmark timing measurements for source code example Ch08_05. These measurements were made using 250 × 250 matrices.
Table 8-1

Matrix Multiplication (Single-Precision) Execution Times (Microseconds)

CPU

MatrixMulF32_Cpp()

MatrixMulF32_Iavx512()

Intel Core i5-11600K

11432

713

Listing 8-6 shows the calculating code for source code example Ch08_06, which performs double-precision floating-point matrix multiplication.
//------------------------------------------------
//               Ch08_06_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_06.h"
// Reference (non-SIMD) double-precision matrix multiply: c = a * b.
// Delegates to the MatrixF64 class's standard C++ implementation; included
// for result comparison and benchmarking against the AVX-512 version.
void MatrixMulF64_Cpp(MatrixF64& c, const MatrixF64& a, const MatrixF64& b)
{
    MatrixF64::Mul(c, a, b);
}
void MatrixMulF64_Iavx512(MatrixF64& c, const MatrixF64& a, const MatrixF64& b)
{
    if (!CheckArgs(c, a, b))
        throw std::runtime_error("MatrixMulF64_Iavx512() CheckArgs failed");
    const double* aa = a.Data();
    const double* bb = b.Data();
    double* cc = c.Data();
    size_t m = a.GetNumCols();
    size_t c_nrows = c.GetNumRows();
    size_t c_ncols = c.GetNumCols();
    size_t a_ncols = a.GetNumCols();
    size_t b_ncols = b.GetNumCols();
    const size_t num_simd_elements = 8;
    size_t num_residual_cols = c_ncols % num_simd_elements;
    // res_mask = 2 ** num_residual_cols - 1
    __mmask8 res_mask = (__mmask8)((1 << num_residual_cols) - 1);
    for (size_t i = 0; i < c_nrows; i++)
    {
        size_t j = 0;
        while (j + num_simd_elements <= c_ncols)
        {
            __m512d c_vals = _mm512_setzero_pd();
            for (size_t k = 0; k < m; k++)
            {
                __m512d a_vals = _mm512_set1_pd(aa[i * a_ncols + k]);
                __m512d b_vals = _mm512_loadu_pd(&bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_pd(a_vals, b_vals, c_vals);
            }
            _mm512_storeu_pd(&cc[i * c_ncols + j], c_vals);
            j += num_simd_elements;
        }
        if (num_residual_cols != 0)
        {
            __m512d c_vals = _mm512_setzero_pd();
            for (size_t k = 0; k < m; k++)
            {
                __m512d a_vals = _mm512_set1_pd(aa[i * a_ncols + k]);
                __m512d b_vals = _mm512_maskz_loadu_pd(res_mask, &bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_pd(a_vals, b_vals, c_vals);
            }
            _mm512_mask_storeu_pd(&cc[i * c_ncols + j], res_mask, c_vals);
        }
    }
}
Listing 8-6

Example Ch08_06

The primary difference between functions MatrixMulF32_Iavx512() and MatrixMulF64_Iavx512() is that the latter uses the __m512d data type and _pd variants of the C++ SIMD calculating functions. It also uses a mask of type __mmask8 instead of __mmask16. The results for Ch08_06 are the same as Ch08_05. Table 8-2 shows the benchmark timing measurements for source code example Ch08_06. These measurements were made using 250 × 250 matrices.
Table 8-2

Matrix Multiplication (Double-Precision) Execution Times (Microseconds)

CPU

MatrixMulF64_Cpp()

MatrixMulF64_Iavx512()

Intel Core i5-11600K

11972

1518

Matrix (4 x 4) Vector Multiplication

Listing 8-7 shows the calculating code for source code example Ch08_07. This example, which is an AVX-512 implementation of source code example Ch05_06, illustrates matrix-vector (4 × 4, 4 × 1) multiplication using single-precision floating-point values. Before examining the source code in Listing 8-7, you may want to review the matrix-vector multiplication equations shown in Figure 5-5.
//------------------------------------------------
//               Ch08_07_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <cstdint>
#include <immintrin.h>
#include "Ch08_07.h"
#include "MatrixF32.h"
#include "AlignedMem.h"
// Permutation indices for matrix columns. The 4x4 matrix is stored row-major
// in a ZMM register, so element indices {n, n+4, n+8, n+12} select column n.
// Each table repeats its column's indices four times, producing four copies
// of that column when used with _mm512_permutexvar_ps() (see Figure 8-2).
alignas(64) const uint32_t c_MatIndCol0[] { 0, 4, 8, 12, 0, 4, 8, 12,
                                            0, 4, 8, 12, 0, 4, 8, 12 };
alignas(64) const uint32_t c_MatIndCol1[] { 1, 5, 9, 13, 1, 5, 9, 13,
                                            1, 5, 9, 13, 1, 5, 9, 13 };
alignas(64) const uint32_t c_MatIndCol2[] { 2, 6, 10, 14, 2, 6, 10, 14,
                                            2, 6, 10, 14, 2, 6, 10, 14 };
alignas(64) const uint32_t c_MatIndCol3[] { 3, 7, 11, 15, 3, 7, 11, 15,
                                            3, 7, 11, 15, 3, 7, 11, 15 };
// Permutation indices for vector components. A ZMM register holds four
// packed 4x1 vectors (W at offsets 0/4/8/12, X at 1/5/9/13, etc.). Each
// table broadcasts one component of each vector across its own 128-bit
// lane (see Figure 8-3).
alignas(64) const uint32_t c_VecIndW[] { 0, 0, 0, 0, 4, 4, 4, 4,
                                         8, 8, 8, 8, 12, 12, 12, 12 };
alignas(64) const uint32_t c_VecIndX[] { 1, 1, 1, 1, 5, 5, 5, 5,
                                         9, 9, 9, 9, 13, 13, 13, 13 };
alignas(64) const uint32_t c_VecIndY[] { 2, 2, 2, 2, 6, 6, 6, 6,
                                         10, 10, 10, 10, 14, 14, 14, 14 };
alignas(64) const uint32_t c_VecIndZ[] { 3, 3, 3, 3, 7, 7, 7, 7,
                                         11, 11, 11, 11, 15, 15, 15, 15 };
void MatVecMulF32_Cpp(Vec4x1_F32* vec_b, MatrixF32& m, Vec4x1_F32* vec_a,
    size_t num_vec)
{
    if (!CheckArgs(vec_b, m, vec_a, num_vec))
        throw std::runtime_error("MatVecMulF32_Cpp() - CheckArgs failed");
    const float* mm = m.Data();
    // Calculate matrix-vector products
    for (size_t i = 0; i < num_vec; i++)
    {
        vec_b[i].W =  mm[0] * vec_a[i].W + mm[1] * vec_a[i].X;
        vec_b[i].W += mm[2] * vec_a[i].Y + mm[3] * vec_a[i].Z;
        vec_b[i].X =  mm[4] * vec_a[i].W + mm[5] * vec_a[i].X;
        vec_b[i].X += mm[6] * vec_a[i].Y + mm[7] * vec_a[i].Z;
        vec_b[i].Y =  mm[8] * vec_a[i].W + mm[9] * vec_a[i].X;
        vec_b[i].Y += mm[10] * vec_a[i].Y + mm[11] * vec_a[i].Z;
        vec_b[i].Z =  mm[12] * vec_a[i].W + mm[13] * vec_a[i].X;
        vec_b[i].Z += mm[14] * vec_a[i].Y + mm[15] * vec_a[i].Z;
    }
}
// AVX-512 matrix-vector (4x4, 4x1) multiplication. The main loop computes
// four matrix-vector products per iteration using permuted column/component
// copies; a 128-bit loop handles any residual vectors.
// Note: the _mm512_load_ps / _mm_store_ps calls are aligned accesses, so
// vec_a and vec_b are assumed suitably aligned — presumably validated by
// CheckArgs(); confirm against the caller.
void MatVecMulF32a_Iavx512(Vec4x1_F32* vec_b, MatrixF32& m, Vec4x1_F32* vec_a, size_t num_vec)
{
    if (!CheckArgs(vec_b, m, vec_a, num_vec))
        throw std::runtime_error("MatVecMulF32a_Iavx512() - CheckArgs failed");
    // Load indices for matrix and vector permutations
    __m512i m_ind_col0 = _mm512_load_epi32(c_MatIndCol0);
    __m512i m_ind_col1 = _mm512_load_epi32(c_MatIndCol1);
    __m512i m_ind_col2 = _mm512_load_epi32(c_MatIndCol2);
    __m512i m_ind_col3 = _mm512_load_epi32(c_MatIndCol3);
    __m512i v_ind_w = _mm512_load_epi32(c_VecIndW);
    __m512i v_ind_x = _mm512_load_epi32(c_VecIndX);
    __m512i v_ind_y = _mm512_load_epi32(c_VecIndY);
    __m512i v_ind_z = _mm512_load_epi32(c_VecIndZ);
    // Load source matrix and permute 4 copies of each column (Figure 8-2);
    // each m_colN_vals register holds the same column replicated four times
    __m512 m_vals = _mm512_load_ps(m.Data());
    __m512 m_col0_vals = _mm512_permutexvar_ps(m_ind_col0, m_vals);
    __m512 m_col1_vals = _mm512_permutexvar_ps(m_ind_col1, m_vals);
    __m512 m_col2_vals = _mm512_permutexvar_ps(m_ind_col2, m_vals);
    __m512 m_col3_vals = _mm512_permutexvar_ps(m_ind_col3, m_vals);
    // Calculate matrix-vector products, four vectors per iteration
    size_t i = 0;
    const size_t num_vec_per_iteration = 4;
    for (; num_vec - i >= num_vec_per_iteration; i += num_vec_per_iteration)
    {
        // Load next block of 4 vectors
        __m512 va_vals = _mm512_load_ps(&vec_a[i]);
        // Permute vectors into W, X, Y, Z components (Figure 8-3)
        __m512 va_w_vals = _mm512_permutexvar_ps(v_ind_w, va_vals);
        __m512 va_x_vals = _mm512_permutexvar_ps(v_ind_x, va_vals);
        __m512 va_y_vals = _mm512_permutexvar_ps(v_ind_y, va_vals);
        __m512 va_z_vals = _mm512_permutexvar_ps(v_ind_z, va_vals);
        // Calculate and save matrix-vector products:
        // b = col0*W + col1*X + col2*Y + col3*Z, accumulated with FMA
        __m512 vb_vals = _mm512_mul_ps(m_col0_vals, va_w_vals);
        vb_vals = _mm512_fmadd_ps(m_col1_vals, va_x_vals, vb_vals);
        vb_vals = _mm512_fmadd_ps(m_col2_vals, va_y_vals, vb_vals);
        vb_vals = _mm512_fmadd_ps(m_col3_vals, va_z_vals, vb_vals);
        _mm512_store_ps(&vec_b[i], vb_vals);
    }
    // Residual vectors (num_vec not a multiple of 4), one at a time
    if (i < num_vec)
    {
        // Each source register holds four identical copies of its column
        // (see the c_MatIndColN tables), so any 128-bit lane may be
        // extracted; lanes 0-3 are used here.
        __m128 m_col0_vals2 = _mm512_extractf32x4_ps(m_col0_vals, 0);
        __m128 m_col1_vals2 = _mm512_extractf32x4_ps(m_col1_vals, 1);
        __m128 m_col2_vals2 = _mm512_extractf32x4_ps(m_col2_vals, 2);
        __m128 m_col3_vals2 = _mm512_extractf32x4_ps(m_col3_vals, 3);
        for (; i < num_vec; i++)
        {
            // Broadcast each component of the current vector
            __m128 va_w_vals = _mm_broadcast_ss(&vec_a[i].W);
            __m128 va_x_vals = _mm_broadcast_ss(&vec_a[i].X);
            __m128 va_y_vals = _mm_broadcast_ss(&vec_a[i].Y);
            __m128 va_z_vals = _mm_broadcast_ss(&vec_a[i].Z);
            // Single 4x1 product using the 128-bit column copies
            __m128 vb_vals = _mm_mul_ps(m_col0_vals2, va_w_vals);
            vb_vals = _mm_fmadd_ps(m_col1_vals2, va_x_vals, vb_vals);
            vb_vals = _mm_fmadd_ps(m_col2_vals2, va_y_vals, vb_vals);
            vb_vals = _mm_fmadd_ps(m_col3_vals2, va_z_vals, vb_vals);
            _mm_store_ps((float*)&vec_b[i], vb_vals);
        }
    }
}
Listing 8-7

Example Ch08_07

The source code in file Ch08_07_fcpp.cpp begins with a series of arrays that contain permutation indices. The AVX-512 implementation of the matrix-vector multiplication algorithm uses these indices to reorder the elements of the source matrix and vectors. The reason for this reordering is to facilitate the calculation of four matrix-vector products during each iteration of the for-loop. The definition of function MatVecMulF32_Cpp() follows the permutation indices. This function calculates matrix-vector (4 × 4, 4 × 1) products using standard C++ statements.

Following argument validation, function MatVecMulF32a_Iavx512() loads the permutation indices using a series of _mm512_load_epi32() calls. The ensuing call to _mm512_load_ps() loads matrix m into m_vals. This is followed by a series of four calls to _mm512_permutexvar_ps() that permute the elements in m_vals to generate four copies of each column in matrix m as shown in Figure 8-2.
Figure 8-2

Permutation of matrix columns using _mm512_permutexvar_ps()

Each iteration of the first for-loop in MatVecMulF32a_Iavx512() begins with a call to _mm512_load_ps() that loads a block of four vectors into va_vals. The next code block employs the C++ SIMD intrinsic function _mm512_permutexvar_ps() to reorder vector components W, X, Y, and Z. Figure 8-3 illustrates this operation in greater detail. Following the permutation, MatVecMulF32a_Iavx512() invokes _mm512_mul_ps() and _mm512_fmadd_ps() to calculate four matrix-vector products. The final call of the for-loop, _mm512_store_ps(), saves the just calculated matrix-vector products. The second for-loop in MatVecMulF32a_Iavx512() calculates any residual matrix-vector products if num_vec is not an integral multiple of num_vec_per_iteration.
Figure 8-3

Permutation of vector components using _mm512_permutexvar_ps()

Not shown in Listing 8-7 is the function MatVecMulF32b_Iavx512(). This function differs slightly from MatVecMulF32a_Iavx512() in that it uses the C++ SIMD intrinsic function _mm512_stream_ps() to save the calculated matrix-vector products. Unlike function _mm512_store_ps(), function _mm512_stream_ps() saves the specified data SIMD value using a nontemporal memory hint. A nontemporal memory hint notifies the processor that the data being saved will not be immediately referenced again. This allows the processor to (optionally) bypass its normal memory cache hierarchy, which minimizes cache pollution and often results in better performance as you will soon see. It is, however, important to note that improper use of _mm512_stream_ps() can result in slower performance. Appendix B contains a list of AMD and Intel programming reference manuals that you can consult for more information about nontemporal memory hints. Here are the results for source code example Ch08_07:
Results for MatrixVecMulF32
Test case #0
vec_b1:      304.0      564.0      824.0     1084.0
vec_b2:      304.0      564.0      824.0     1084.0
vec_b3:      304.0      564.0      824.0     1084.0
Test case #1
vec_b1:      764.0     1424.0     2084.0     2744.0
vec_b2:      764.0     1424.0     2084.0     2744.0
vec_b3:      764.0     1424.0     2084.0     2744.0
Test case #2
vec_b1:     1224.0     2284.0     3344.0     4404.0
vec_b2:     1224.0     2284.0     3344.0     4404.0
vec_b3:     1224.0     2284.0     3344.0     4404.0
Test case #3
vec_b1:     1684.0     3144.0     4604.0     6064.0
vec_b2:     1684.0     3144.0     4604.0     6064.0
vec_b3:     1684.0     3144.0     4604.0     6064.0
Test case #4
vec_b1:    11932.0    22452.0    32972.0    43492.0
vec_b2:    11932.0    22452.0    32972.0    43492.0
vec_b3:    11932.0    22452.0    32972.0    43492.0
Test case #5
vec_b1:    17125.0    31705.0    46285.0    60865.0
vec_b2:    17125.0    31705.0    46285.0    60865.0
vec_b3:    17125.0    31705.0    46285.0    60865.0
Test case #6
vec_b1:    12723.0    23873.0    35023.0    46173.0
vec_b2:    12723.0    23873.0    35023.0    46173.0
vec_b3:    12723.0    23873.0    35023.0    46173.0
Test case #7
vec_b1:    15121.0    27871.0    40621.0    53371.0
vec_b2:    15121.0    27871.0    40621.0    53371.0
vec_b3:    15121.0    27871.0    40621.0    53371.0
Test case #8
vec_b1:    13789.0    26039.0    38289.0    50539.0
vec_b2:    13789.0    26039.0    38289.0    50539.0
vec_b3:    13789.0    26039.0    38289.0    50539.0
Test case #9
vec_b1:     9663.0    17873.0    26083.0    34293.0
vec_b2:     9663.0    17873.0    26083.0    34293.0
vec_b3:     9663.0    17873.0    26083.0    34293.0
Running benchmark function MatrixVecMulF32_bm - please wait
Benchmark times save to file Ch08_07_MatrixVecMulF32_bm_LITHIUM.csv
Table 8-3 shows the benchmark timing measurements for source code example Ch08_07. Note that use of the C++ intrinsic function _mm512_stream_ps() in MatVecMulF32b_Iavx512() yielded an appreciable improvement in performance.
Table 8-3

Matrix-Vector (4 × 4, 4 × 1) Multiplication Execution Times (Microseconds), 1,000,000 Vectors

CPU

MatVecMulF32_Cpp()

MatVecMulF32a_Iavx512()

MatVecMulF32b_Iavx512()

Intel Core i5-11600K

5069

1111

708

Convolutions

In Chapter 6, you learned how to compute 1D and 2D discrete convolutions using C++ intrinsic functions and AVX2. In this section, you will examine two source code examples that illustrate convolutions using AVX-512. Like Chapter 6, the source code examples discussed in this section are somewhat more specialized than those covered in the previous sections. If your SIMD programming interests reside elsewhere, you can either skim this section or skip ahead to the next chapter. If you decide to continue, you may want to review the sections in Chapter 6 that explained the mathematics of a discrete convolution before examining the source code.

1D Convolutions

The next source code example, Ch08_08, implements a 1D discrete convolution using C++ SIMD intrinsic functions and AVX-512. Listing 8-8 shows the calculating code for this source code example, which is a modified version of source code example Ch06_01.
//------------------------------------------------
//               Ch08_08_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_08.h"
#include "MiscTypes.h"
void Convolve1D_F32_Cpp(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1D_F32_Cpp() - CheckArgs failed");
    indx_t num_pts = (indx_t)y.size();
    indx_t ks2 = kernel.size() / 2;
    for (indx_t i = ks2; i < num_pts - ks2; i++)
    {
        float y_val = 0;
        for (indx_t k = -ks2; k <= ks2; k++)
            y_val += x[i - k] * kernel[k + ks2];
        y[i] = y_val;
    }
}
//-----------------------------------------------------------------------------
// Convolve1D_F32_Iavx512 - AVX-512 implementation of a 1D discrete
// convolution (single-precision, variable kernel width).
//
// y      - output signal; elements y[ks2 .. num_pts-ks2-1] receive the
//          convolution result, border elements are left untouched
// x      - input signal (validated against y/kernel by CheckArgs)
// kernel - convolution kernel; size is assumed odd so that ks2 = size / 2
//
// Throws std::runtime_error when CheckArgs() rejects the arguments.
//
// Each while-loop iteration produces as many output points as fit in the
// widest usable SIMD register: 16 (__m512), then 8 (__m256), then 4 (__m128),
// then a single scalar element for the residual tail. The inner loops all
// compute the same sum-of-products as the scalar version, vectorized across
// adjacent output positions.
//-----------------------------------------------------------------------------
void Convolve1D_F32_Iavx512(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1D_F32_Iavx512() - CheckArgs failed");
    indx_t ks2 = (indx_t)kernel.size() / 2;
    indx_t num_pts = (indx_t)y.size();

    // Output points produced per iteration by each register width.
    const indx_t num_simd_elements = 16;  // __m512  (16 x F32)
    const indx_t num_simd_elements2 = 8;  // __m256  ( 8 x F32)
    const indx_t num_simd_elements3 = 4;  // __m128  ( 4 x F32)

    indx_t i = ks2;
    while (i < num_pts - ks2)
    {
        if ((i + num_simd_elements) <= num_pts - ks2)
        {
            // 16 output points at once: broadcast each kernel coefficient and
            // accumulate with fused multiply-add.
             __m512 y_vals = _mm512_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m512 x_vals = _mm512_loadu_ps(&x[i - k]);
                __m512 kernel_vals = _mm512_set1_ps(kernel[k + ks2]);
                y_vals = _mm512_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm512_storeu_ps(&y[i], y_vals);
            i += num_simd_elements;
        }
        else if ((i + num_simd_elements2) <= num_pts - ks2)
        {
            // 8 output points at once (AVX2-width fallback).
             __m256 y_vals = _mm256_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m256 x_vals = _mm256_loadu_ps(&x[i - k]);
                __m256 kernel_vals = _mm256_set1_ps(kernel[k + ks2]);
                y_vals = _mm256_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm256_storeu_ps(&y[i], y_vals);
            i += num_simd_elements2;
        }
        else if ((i + num_simd_elements3) <= num_pts - ks2)
        {
            // 4 output points at once (SSE-width fallback).
             __m128 y_vals = _mm_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m128 x_vals = _mm_loadu_ps(&x[i - k]);
                __m128 kernel_vals = _mm_set1_ps(kernel[k + ks2]);
                y_vals = _mm_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm_storeu_ps(&y[i], y_vals);
            i += num_simd_elements3;
        }
        else
        {
            // Residual: one output point at a time using scalar SSE ops.
             __m128 y_val = _mm_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m128 x_val = _mm_load_ss(&x[i - k]);
                __m128 k_val = _mm_load_ss(&kernel[k + ks2]);
                y_val = _mm_fmadd_ss(x_val, k_val, y_val);
            }
            _mm_store_ss(&y[i], y_val);
            i += 1;
        }
    }
}
void Convolve1DKs5_F32_Iavx512(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1DKs5_F32_Iavx2() - CheckArgs failed");
    if (kernel.size() != 5)
        throw std::runtime_error("Convolve1DKs5_F32_Iavx2() - invalid kernel size");
    indx_t ks2 = (indx_t)kernel.size() / 2;
    indx_t num_pts = (indx_t)y.size();
    const indx_t num_simd_elements = 16; // number of F32 elements (__m512)
    const indx_t num_simd_elements2 = 8; // number of F32 elements (__m256)
    const indx_t num_simd_elements3 = 4; // number of F32 elements (__m128)
    __m512 kernel512_0 = _mm512_set1_ps(kernel[0]);
    __m512 kernel512_1 = _mm512_set1_ps(kernel[1]);
    __m512 kernel512_2 = _mm512_set1_ps(kernel[2]);
    __m512 kernel512_3 = _mm512_set1_ps(kernel[3]);
    __m512 kernel512_4 = _mm512_set1_ps(kernel[4]);
    __m256 kernel256_0 = _mm256_set1_ps(kernel[0]);
    __m256 kernel256_1 = _mm256_set1_ps(kernel[1]);
    __m256 kernel256_2 = _mm256_set1_ps(kernel[2]);
    __m256 kernel256_3 = _mm256_set1_ps(kernel[3]);
    __m256 kernel256_4 = _mm256_set1_ps(kernel[4]);
    __m128 kernel128_0 = _mm_set1_ps(kernel[0]);
    __m128 kernel128_1 = _mm_set1_ps(kernel[1]);
    __m128 kernel128_2 = _mm_set1_ps(kernel[2]);
    __m128 kernel128_3 = _mm_set1_ps(kernel[3]);
    __m128 kernel128_4 = _mm_set1_ps(kernel[4]);
    indx_t i = ks2;
    while (i < num_pts - ks2)
    {
        indx_t j = i + ks2;
        if ((i + num_simd_elements) <= num_pts - ks2)
        {
            __m512 x_vals = _mm512_loadu_ps(&x[j]);
            __m512 y_vals = _mm512_mul_ps(x_vals, kernel512_0);
            x_vals = _mm512_loadu_ps(&x[j - 1]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_1, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 2]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_2, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 3]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_3, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 4]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_4, y_vals);
            _mm512_storeu_ps(&y[i], y_vals);
            i += num_simd_elements;
        }
        else if ((i + num_simd_elements2) <= num_pts - ks2)
        {
            __m256 x_vals = _mm256_loadu_ps(&x[j]);
            __m256 y_vals = _mm256_mul_ps(x_vals, kernel256_0);
            x_vals = _mm256_loadu_ps(&x[j - 1]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_1, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 2]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_2, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 3]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_3, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 4]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_4, y_vals);
            _mm256_storeu_ps(&y[i], y_vals);
            i += num_simd_elements2;
        }
        else if ((i + num_simd_elements3) <= num_pts - ks2)
        {
            __m128 x_vals = _mm_loadu_ps(&x[j]);
            __m128 y_vals = _mm_mul_ps(x_vals, kernel128_0);
            x_vals = _mm_loadu_ps(&x[j - 1]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_1, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 2]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_2, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 3]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_3, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 4]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_4, y_vals);
            _mm_storeu_ps(&y[i], y_vals);
            i += num_simd_elements3;
        }
        else
        {
            __m128 x_val = _mm_load_ss(&x[j]);
            __m128 y_val = _mm_mul_ss(x_val, kernel128_0);
            x_val = _mm_load_ss(&x[j - 1]);
            y_val = _mm_fmadd_ss(x_val, kernel128_1, y_val);
            x_val = _mm_load_ss(&x[j - 2]);
            y_val = _mm_fmadd_ss(x_val, kernel128_2, y_val);
            x_val = _mm_load_ss(&x[j - 3]);
            y_val = _mm_fmadd_ss(x_val, kernel128_3, y_val);
            x_val = _mm_load_ss(&x[j - 4]);
            y_val = _mm_fmadd_ss(x_val, kernel128_4, y_val);
            _mm_store_ss(&y[i], y_val);
            i += 1;
        }
    }
}
Listing 8-8

Example Ch08_08

The first function in Listing 8-8, Convolve1D_F32_Cpp(), implements a 1D discrete convolution using standard C++ statements. This function is identical to the one you saw in source code example Ch06_01 and is included again here for benchmarking purposes. The next function in Listing 8-8, named Convolve1D_F32_Iavx512(), uses AVX-512 C++ SIMD intrinsic functions to implement a 1D discrete convolution. This function is similar to the function Convolve1D_F32_Iavx2() that was presented in source example Ch06_01. The primary difference is that Convolve1D_F32_Iavx512() includes an extra code block near the top of the while-loop that processes signal elements y[i:i+15] using __m512 data types and the following C++ SIMD intrinsic functions: _mm512_loadu_ps(), _mm512_set1_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps(). The other code blocks in the while-loop process signal elements y[i:i+7], y[i:i+3], or y[i] using C++ SIMD intrinsic functions and data types just like function Convolve1D_F32_Iavx2() did in example Ch06_01.

Following Convolve1D_F32_Iavx512() in Listing 8-8 is the function Convolve1DKs5_F32_Iavx512(). This function implements a 1D discrete convolution using AVX-512 and is optimized for a five-element convolution kernel. Recall from the discussions in Chapter 6 that many real-world signal processing applications frequently employ size-optimized convolution functions since they are often faster than their variable-width counterparts. Note that the principal modification between the code in Convolve1DKs5_F32_Iavx512() and the Ch06_01 function Convolve1DKs5_F32_Iavx2() is that the former includes a code block near the top of the while-loop that processes signal elements y[i:i+15] using __m512 data types and AVX-512 C++ SIMD intrinsic functions.

Source code example Ch08_08 exemplifies that it is often a straightforward programming exercise to port an AVX2 calculating function to one that can exploit the additional computational resources of AVX-512. Here are the results for source code example Ch08_08:
Executing Convolve1D_F32()
Results saved to file Ch08_08_Convolve1D_F32_Results_LITHIUM.csv
Running benchmark function Convolve1D_F32_bm - please wait
Benchmark times saved to file Ch08_08_Convolve1D_F32_bm_LITHIUM.csv
Table 8-4 shows some benchmark timing measurements for source code example Ch08_08. These measurements were made using a 1,000,000-element input signal array and a five-element convolution kernel.
Table 8-4

1D Discrete Convolution (Single-Precision) Execution Times (Microseconds)

CPU

Convolve1D_F32_Cpp()

Convolve1D_F32_Iavx512()

Convolve1DKs5_F32_Iavx512()

Intel Core i5-11600K

2268

242

200

2D Convolutions

The final source code example of this chapter, Ch08_09, demonstrates how to employ AVX-512 C++ SIMD intrinsic functions to calculate a 2D discrete convolution. This example is a modified version of source code example Ch06_04, which carried out a 2D discrete convolution using two 1D discrete convolutions. Listing 8-9 shows the calculating code for example Ch08_09. Before examining the code in Listing 8-9, you may want to review the definition of structure CD_1Dx2 that is shown in Listing 6-4 since the same structure is used in example Ch08_09.
//------------------------------------------------
//               Ch08_09_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_09.h"
#include "MiscTypes.h"
void Convolve1Dx2_F32_Cpp(CD_1Dx2& cd)
{
    if (!CheckArgs1Dx2(cd))
        throw std::runtime_error("Convolve1Dx2_F32_Cpp() - CheckArgs failed");
    indx_t ks = (indx_t)cd.m_KernelSize;
    indx_t ks2 = ks / 2;
    indx_t im_h = cd.m_ImH;
    indx_t im_w = cd.m_ImW;
    const std::vector<float>& im_src = cd.m_ImSrc;
    std::vector<float>& im_des = cd.m_ImDes;
    std::vector<float>& im_tmp = cd.m_ImTmp;
    const std::vector<float>& im_ker_x = cd.m_Kernel1Dx;
    const std::vector<float>& im_ker_y = cd.m_Kernel1Dy;
    // Perform 1D convolution (X)
    for (indx_t i = ks2; i < im_h - ks2; i++)
    {
        for (indx_t j = ks2; j < im_w - ks2; j++)
        {
            float im_tmp_val = 0;
            for (indx_t k = -ks2; k <= ks2; k++)
                im_tmp_val += im_src[i * im_w + j - k] * im_ker_x[k + ks2];
            im_tmp[i * im_w + j] = im_tmp_val;
        }
    }
    // Perform 1D convolution (Y)
    for (indx_t j = ks2; j < im_w - ks2; j++)
    {
        for (indx_t i = ks2; i < im_h - ks2; i++)
        {
            float im_des_val = 0;
            for (indx_t k = -ks2; k <= ks2; k++)
                im_des_val += im_tmp[(i - k) * im_w + j] * im_ker_y[k + ks2];
            im_des[i * im_w + j] = im_des_val;
        }
    }
}
//-----------------------------------------------------------------------------
// Convolve1Dx2_F32_Iavx512 - AVX-512 implementation of a 2D discrete
// convolution carried out as two separable 1D passes: a horizontal (X) pass
// into the temporary image, then a vertical (Y) pass into the destination.
//
// cd - convolution descriptor holding the source, destination, and temporary
//      images, the image dimensions, and the two 1D kernels
//
// Throws std::runtime_error when CheckArgs1Dx2() rejects the descriptor.
//
// Both passes vectorize across the column index j, processing 16 (__m512),
// 8 (__m256), or 4 (__m128) pixels per iteration, with a scalar path for the
// residual columns. In the X pass the SIMD-width choice is made per row
// segment; in the Y pass it is made once per column band and the inner loop
// walks down the rows.
//-----------------------------------------------------------------------------
void Convolve1Dx2_F32_Iavx512(CD_1Dx2& cd)
{
    if (!CheckArgs1Dx2(cd))
        throw std::runtime_error("Convolve1Dx2_F32_Iavx512() - CheckArgs failed");
    indx_t ks = (indx_t)cd.m_KernelSize;
    indx_t ks2 = ks / 2;
    indx_t im_h = cd.m_ImH;
    indx_t im_w = cd.m_ImW;
    const std::vector<float>& im_src = cd.m_ImSrc;
    std::vector<float>& im_des = cd.m_ImDes;
    std::vector<float>& im_tmp = cd.m_ImTmp;
    const std::vector<float>& im_ker_x = cd.m_Kernel1Dx;
    const std::vector<float>& im_ker_y = cd.m_Kernel1Dy;

    // Pixels processed per iteration by each register width.
    const indx_t num_simd_elements = 16;  // __m512  (16 x F32)
    const indx_t num_simd_elements2 = 8;  // __m256  ( 8 x F32)
    const indx_t num_simd_elements3 = 4;  // __m128  ( 4 x F32)

    // Perform 1D convolution (X): im_src -> im_tmp, row by row.
    for (indx_t i = ks2; i < im_h - ks2; i++)
    {
        indx_t j = ks2;
        while (j < im_w - ks2)
        {
            if (j + num_simd_elements <= im_w - ks2)
            {
                // 16 adjacent pixels per iteration.
                __m512 im_tmp_vals = _mm512_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m512 im_src_vals = _mm512_loadu_ps(&im_src[i * im_w + j - k]);
                    __m512 im_ker_vals = _mm512_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm512_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm512_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements;
            }
            else if (j + num_simd_elements2 <= im_w - ks2)
            {
                // 8 adjacent pixels per iteration (AVX2-width fallback).
                __m256 im_tmp_vals = _mm256_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m256 im_src_vals = _mm256_loadu_ps(&im_src[i * im_w + j - k]);
                    __m256 im_ker_vals = _mm256_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm256_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm256_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements2;
            }
            else if (j + num_simd_elements3 <= im_w - ks2)
            {
                // 4 adjacent pixels per iteration (SSE-width fallback).
                __m128 im_tmp_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_src_vals = _mm_loadu_ps(&im_src[i * im_w + j - k]);
                    __m128 im_ker_vals = _mm_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements3;
            }
            else
            {
                // Residual: one pixel at a time using scalar SSE ops.
                __m128 im_tmp_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_src_vals = _mm_load_ss(&im_src[i * im_w + j - k]);
                    __m128 im_ker_vals = _mm_load_ss(&im_ker_x[k + ks2]);
                    im_tmp_vals = _mm_fmadd_ss(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm_store_ss(&im_tmp[i * im_w + j], im_tmp_vals);
                j += 1;
            }
        }
    }

    // Perform 1D convolution (Y): im_tmp -> im_des. Note the loop nesting is
    // inverted relative to the X pass: the SIMD width is chosen per column
    // band j, then the inner loop runs down the rows i so the vertical taps
    // (i - k) can reuse the same 16/8/4-wide column load pattern.
    indx_t j = ks2;
    while (j < im_w - ks2)
    {
        if (j + num_simd_elements <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m512 im_des_vals = _mm512_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m512 im_tmp_vals = _mm512_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m512 im_ker_vals = _mm512_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm512_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm512_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements;
        }
        else if (j + num_simd_elements2 <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m256 im_des_vals = _mm256_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m256 im_tmp_vals = _mm256_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m256 im_ker_vals = _mm256_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm256_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm256_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements2;
        }
        else if (j + num_simd_elements3 <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m128 im_des_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_tmp_vals = _mm_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m128 im_ker_vals = _mm_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements3;
        }
        else
        {
            // Residual columns: one pixel per row using scalar SSE ops.
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m128 im_des_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_tmp_vals = _mm_load_ss(&im_tmp[(i - k) * im_w + j]);
                    __m128 im_ker_vals = _mm_load_ss(&im_ker_y[k + ks2]);
                    im_des_vals = _mm_fmadd_ss(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm_store_ss(&im_des[i * im_w + j], im_des_vals);
            }
            j += 1;
        }
    }
}
Listing 8-9

Example Ch08_09

The source code for file Ch08_09_fcpp.cpp that is shown in Listing 8-9 is somewhat lengthy but (hopefully) relatively straightforward to comprehend. It begins with the function Convolve1Dx2_F32_Cpp(), which implements a 2D discrete convolution using standard C++ statements. This function is identical to the one you studied in source code example Ch06_04 and is included again in this example for benchmarking purposes.

Also shown in Listing 8-9 is the SIMD calculating function Convolve1Dx2_F32_Iavx512(). This function, which is a modified version of function Convolve1Dx2_F32_Iavx2() (see Listing 6-4), performs a 2D discrete convolution using AVX-512 C++ SIMD intrinsic functions. In function Convolve1Dx2_F32_Iavx512(), note the inclusion of an extra if block in the x-axis section that processes image pixels using __m512 data types and the following C++ SIMD intrinsic functions: _mm512_loadu_ps(), _mm512_set1_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps(). A similar if block was also added to the y-axis section of Convolve1Dx2_F32_Iavx512().

Source code example Ch08_09 exemplifies again that with a little forethought, it is often relatively easy to transform a calculating function that employs AVX2 to one that exploits AVX-512. When developing SIMD calculating functions that perform calculations using AVX or AVX2, you should always keep in mind that you may want to create an AVX-512 variant sometime in the future. Here are the results for source code example Ch08_09:
Performing convolutions
Saving destination image files
rc:       true
num_diff: 0
Running benchmark function Convolve1Dx2_F32_bm - please wait
..................................................
Benchmark times saved to file Ch08_09_Convolve1Dx2_F32_bm_LITHIUM.csv
Table 8-5 shows some benchmark timing measurements for source code example Ch08_09. These measurements were made using test image ImageE.png and a 9 × 9 convolution kernel that performs low-pass filtering.
Table 8-5

2D Discrete Convolution (Single-Precision) Execution Times (Microseconds)

CPU

Convolve1Dx2_F32_Cpp()

Convolve1Dx2_F32_Iavx512()

Intel Core i5-11600K

14373

2065

Summary

Table 8-6 summarizes the C++ SIMD intrinsic functions that were introduced in this chapter. This table also includes commonly used size variants. AVX-512 C++ SIMD intrinsic functions that perform merge (_mask_) masking and zero (_maskz_) masking are also available for most of the functions listed in Table 8-6, but these are not shown. Before proceeding to the next chapter, you should understand the SIMD arithmetic calculation or data manipulation operation that is performed by each function shown in Table 8-6.
Table 8-6

C++ SIMD Intrinsic Function Summary for Chapter 8

C++ SIMD Function Name

Description

_mm256_insertf64x2

Insert double-precision elements

_mm512_abs_pd, _ps

Packed floating-point absolute value

_mm512_add_pd, _ps

Packed floating-point addition

_mm512_cmp_pd_mask, _ps_mask

Packed floating-point compare

_mm512_div_pd, _ps

Packed floating-point division

_mm512_extractf32x4_ps, f32x8_ps

Extract floating-point elements

_mm512_fmadd_pd, _ps

Packed floating-point fused-multiply-add

_mm512_insertf64x2, f64x4

Insert double-precision elements

_mm512_load_epi8, _epi16, _epi32, _epi64

Load packed integer elements

_mm512_load_pd, _ps

Load (aligned) floating-point elements

_mm512_loadu_pd, _ps

Load (unaligned) floating-point elements

_mm512_max_pd, _ps

Packed floating-point maximum

_mm512_min_pd, _ps

Packed floating-point minimum

_mm512_permutexvar_pd, _ps

Permute floating-point elements

_mm512_reduce_add_pd, _ps

Reduce (sum) floating-point elements

_mm512_set1_pd, _ps

Broadcast floating-point value to all elements

_mm512_setzero_pd, _ps

Set floating-point elements to zero

_mm512_sqrt_pd, _ps

Packed floating-point square root

_mm512_store_pd, _ps

Store (aligned) floating-point elements

_mm512_storeu_pd, _ps

Store (unaligned) floating-point elements

_mm512_stream_pd, _ps

Store (nontemporal) floating-point elements

_mm512_sub_pd, _ps

Packed floating-point subtraction

_mm_stream_pd, _ps

Store (nontemporal) floating-point elements

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
18.223.196.59