© The Author(s), under exclusive license to APress Media, LLC, part of Springer Nature 2022
D. Kusswurm, Modern Parallel Programming with C++ and Assembly Language, https://doi.org/10.1007/978-1-4842-7918-2_8

8. AVX-512 C++ Programming: Part 2

Daniel Kusswurm1  
(1)
Geneva, IL, USA
 

In earlier chapters, you studied a variety of source code examples that demonstrated floating-point calculations and algorithms using AVX and AVX2 C++ SIMD intrinsic functions. In this chapter, you will examine similar source code examples that highlight the use of AVX-512 C++ SIMD intrinsic functions that perform floating-point operations. The first section contains two source code examples that illustrate simple floating-point arithmetic using 512-bit wide operands. The next two sections focus on using AVX-512 to perform computations with floating-point arrays and matrices. The final section explains how to perform discrete convolutions using AVX-512.

Floating-Point Arithmetic

In this section, you will learn how to perform elementary floating-point arithmetic using AVX-512 C++ SIMD intrinsic functions. You will also learn how to carry out merge masking and zero masking using floating-point operands.

Basic Arithmetic

Listing 8-1 contains the source code for example Ch08_01. This example demonstrates basic arithmetic operations using 512-bit wide packed floating-point operands.
//------------------------------------------------
//               Ch08_01.h
//------------------------------------------------
#pragma once
#include "ZmmVal.h"
// Ch08_01_fcpp.cpp
extern void PackedMathF32_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b);
extern void PackedMathF64_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b);
//------------------------------------------------
//               Ch08_01.cpp
//------------------------------------------------
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include "Ch08_01.h"
static void PackedMathF32(void);
static void PackedMathF64(void);
int main()
{
    PackedMathF32();
    PackedMathF64();
}
static void PackedMathF32(void)
{
    ZmmVal a, b, c[8];
    const char nl = ' ';
    a.m_F32[0] = 36.0f;                 b.m_F32[0] = -0.1111111f;
    a.m_F32[1] = 0.03125f;              b.m_F32[1] = 64.0f;
    a.m_F32[2] = 2.0f;                  b.m_F32[2] = -0.0625f;
    a.m_F32[3] = 42.0f;                 b.m_F32[3] = 8.666667f;
    a.m_F32[4] = 7.0f;                  b.m_F32[4] = -18.125f;
    a.m_F32[5] = 20.5f;                 b.m_F32[5] = 56.0f;
    a.m_F32[6] = 36.125f;               b.m_F32[6] = 24.0f;
    a.m_F32[7] = 0.5f;                  b.m_F32[7] = -158.6f;
    a.m_F32[8] = 136.0f;                b.m_F32[8] = -9.1111111f;
    a.m_F32[9] = 2.03125f;              b.m_F32[9] = 864.0f;
    a.m_F32[10] = 32.0f;                b.m_F32[10] = -70.0625f;
    a.m_F32[11] = 442.0f;               b.m_F32[11] = 98.666667f;
    a.m_F32[12] = 57.0f;                b.m_F32[12] = -518.125f;
    a.m_F32[13] = 620.5f;               b.m_F32[13] = 456.0f;
    a.m_F32[14] = 736.125f;             b.m_F32[14] = 324.0f;
    a.m_F32[15] = 80.5f;                b.m_F32[15] = -298.6f;
    PackedMathF32_Iavx512(c, &a, &b);
    std::cout << (" Results for PackedMathF32 ");
    for (unsigned int i = 0; i < 4; i++)
    {
        std::cout << "Group #" << i << nl;
        std::cout << "  a:       " << a.ToStringF32(i) << nl;
        std::cout << "  b:       " << b.ToStringF32(i) << nl;
        std::cout << "  addps:   " << c[0].ToStringF32(i) << nl;
        std::cout << "  subps:   " << c[1].ToStringF32(i) << nl;
        std::cout << "  mulps:   " << c[2].ToStringF32(i) << nl;
        std::cout << "  divps:   " << c[3].ToStringF32(i) << nl;
        std::cout << "  minps:   " << c[4].ToStringF32(i) << nl;
        std::cout << "  maxps:   " << c[5].ToStringF32(i) << nl;
        std::cout << "  sqrtps:  " << c[6].ToStringF32(i) << nl;
        std::cout << "  absps:   " << c[7].ToStringF32(i) << nl;
        std::cout << nl;
    }
}
static void PackedMathF64(void)
{
    ZmmVal a, b, c[8];
    const char nl = ' ';
    a.m_F64[0] = 2.0;           b.m_F64[0] = M_PI;
    a.m_F64[1] = 4.875;         b.m_F64[1] = M_E;
    a.m_F64[2] = 7.5;           b.m_F64[2] = -9.125;
    a.m_F64[3] = 3.0;           b.m_F64[3] = -M_PI;
    a.m_F64[4] = 12.3333;       b.m_F64[4] = M_PI / 2;
    a.m_F64[5] = 24.0;          b.m_F64[5] = M_E / 2;
    a.m_F64[6] = 37.5;          b.m_F64[6] = -9.125 / 2;
    a.m_F64[7] = 43.0;          b.m_F64[7] = -M_PI / 2;
    PackedMathF64_Iavx512(c, &a, &b);
    std::cout << (" Results for PackedMathF64 ");
    for (unsigned int i = 0; i < 4; i++)
    {
        std::cout << "Group #" << i << nl;
        std::cout << "  a:       " << a.ToStringF64(i) << nl;
        std::cout << "  b:       " << b.ToStringF64(i) << nl;
        std::cout << "  addpd:   " << c[0].ToStringF64(i) << nl;
        std::cout << "  subpd:   " << c[1].ToStringF64(i) << nl;
        std::cout << "  mulpd:   " << c[2].ToStringF64(i) << nl;
        std::cout << "  divpd:   " << c[3].ToStringF64(i) << nl;
        std::cout << "  minpd:   " << c[4].ToStringF64(i) << nl;
        std::cout << "  maxpd:   " << c[5].ToStringF64(i) << nl;
        std::cout << "  sqrtpd:  " << c[6].ToStringF64(i) << nl;
        std::cout << "  abspd:   " << c[7].ToStringF64(i) << nl;
        std::cout << nl;
    }
}
//------------------------------------------------
//               Ch08_01_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_01.h"
// Performs eight packed single-precision arithmetic operations on 512-bit
// operands a and b, writing one ZmmVal result per operation into c[0..7].
// All ZmmVal arguments must be aligned on a 64-byte boundary.
void PackedMathF32_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512 av = _mm512_load_ps(a);
    __m512 bv = _mm512_load_ps(b);
    // Calculate the results in the same order the caller displays them.
    __m512 results[8];
    results[0] = _mm512_add_ps(av, bv);
    results[1] = _mm512_sub_ps(av, bv);
    results[2] = _mm512_mul_ps(av, bv);
    results[3] = _mm512_div_ps(av, bv);
    results[4] = _mm512_min_ps(av, bv);
    results[5] = _mm512_max_ps(av, bv);
    results[6] = _mm512_sqrt_ps(av);
    results[7] = _mm512_abs_ps(bv);
    // Save calculated results (destination must be 64-byte aligned).
    for (size_t i = 0; i < 8; i++)
        _mm512_store_ps(&c[i], results[i]);
}
// Double-precision counterpart of PackedMathF32_Iavx512(). Performs eight
// packed F64 arithmetic operations on a and b, one ZmmVal result per
// operation in c[0..7]. All ZmmVal arguments must be 64-byte aligned.
void PackedMathF64_Iavx512(ZmmVal c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512d av = _mm512_load_pd(a);
    __m512d bv = _mm512_load_pd(b);
    // Calculate the results in the same order the caller displays them.
    __m512d results[8];
    results[0] = _mm512_add_pd(av, bv);
    results[1] = _mm512_sub_pd(av, bv);
    results[2] = _mm512_mul_pd(av, bv);
    results[3] = _mm512_div_pd(av, bv);
    results[4] = _mm512_min_pd(av, bv);
    results[5] = _mm512_max_pd(av, bv);
    results[6] = _mm512_sqrt_pd(av);
    results[7] = _mm512_abs_pd(bv);
    // Save calculated results (destination must be 64-byte aligned).
    for (size_t i = 0; i < 8; i++)
        _mm512_store_pd(&c[i], results[i]);
}
Listing 8-1

Example Ch08_01

Toward the top of Listing 8-1 are the function declarations for example Ch08_01. Note that these declarations use the ZmmVal structure that you learned about in Chapter 7. The file Ch08_01.cpp contains two functions named PackedMathF32() and PackedMathF64(). These functions perform test case initialization for the SIMD calculating functions PackedMathF32_Iavx512() and PackedMathF64_Iavx512(). They also stream results to std::cout.

The file Ch08_01_fcpp.cpp begins with the definition of function PackedMathF32_Iavx512(). This function uses the C++ SIMD intrinsic function _mm512_load_ps() to initialize a_vals and b_vals. The next code block consists of C++ SIMD intrinsic function calls that perform various AVX-512 arithmetic operations using packed single-precision floating-point operands. This is followed by a series of _mm512_store_ps() calls that save the calculated results. Note that both _mm512_load_ps() and _mm512_store_ps() require their memory operands to be aligned on a 64-byte boundary.

Function PackedMathF64_Iavx512() is the double-precision floating-point counterpart of PackedMathF32_Iavx512(). Note that this function uses the double-precision (_pd) variants of the same C++ SIMD intrinsic functions. Here are the results for source code example Ch08_01:
Results for PackedMathF32
Group #0
  a:        36.000000        0.031250   |        2.000000       42.000000
  b:        -0.111111       64.000000   |       -0.062500        8.666667
  addps:    35.888889       64.031250   |        1.937500       50.666668
  subps:    36.111111      -63.968750   |        2.062500       33.333332
  mulps:    -4.000000        2.000000   |       -0.125000      364.000000
  divps:  -324.000031        0.000488   |      -32.000000        4.846154
  minps:    -0.111111        0.031250   |       -0.062500        8.666667
  maxps:    36.000000       64.000000   |        2.000000       42.000000
  sqrtps:   6.000000         0.176777   |        1.414214        6.480741
  absps:    0.111111        64.000000   |        0.062500        8.666667
Group #1
  a:           7.000000       20.500000   |       36.125000        0.500000
  b:         -18.125000       56.000000   |       24.000000     -158.600006
  addps:     -11.125000       76.500000   |       60.125000     -158.100006
  subps:      25.125000      -35.500000   |       12.125000      159.100006
  mulps:    -126.875000     1148.000000   |      867.000000      -79.300003
  divps:      -0.386207        0.366071   |        1.505208       -0.003153
  minps:     -18.125000       20.500000   |       24.000000     -158.600006
  maxps:       7.000000       56.000000   |       36.125000        0.500000
  sqrtps:      2.645751        4.527693   |        6.010407        0.707107
  absps:      18.125000       56.000000   |       24.000000      158.600006
Group #2
  a:         136.000000        2.031250   |       32.000000      442.000000
  b:          -9.111111      864.000000   |      -70.062500       98.666664
  addps:     126.888885      866.031250   |      -38.062500      540.666687
  subps:     145.111115     -861.968750   |      102.062500      343.333344
  mulps:   -1239.111084     1755.000000   |    -2242.000000    43610.664062
  divps:     -14.926830        0.002351   |       -0.456735        4.479730
  minps:      -9.111111        2.031250   |      -70.062500       98.666664
  maxps:     136.000000      864.000000   |       32.000000      442.000000
  sqrtps:     11.661903        1.425219   |        5.656854       21.023796
  absps:       9.111111      864.000000   |       70.062500       98.666664
Group #3
  a:          57.000000      620.500000   |      736.125000       80.500000
  b:        -518.125000      456.000000   |      324.000000     -298.600006
  addps:    -461.125000     1076.500000   |     1060.125000     -218.100006
  subps:     575.125000      164.500000   |      412.125000      379.100006
  mulps:  -29533.125000   282948.000000   |   238504.500000   -24037.300781
  divps:      -0.110012        1.360746   |        2.271991       -0.269591
  minps:    -518.125000      456.000000   |      324.000000     -298.600006
  maxps:      57.000000      620.500000   |      736.125000       80.500000
  sqrtps:      7.549834       24.909838   |       27.131624        8.972179
  absps:     518.125000      456.000000   |      324.000000      298.600006
Results for PackedMathF64
Group #0
  a:                     2.000000000000   |                  4.875000000000
  b:                     3.141592653590   |                  2.718281828459
  addpd:                 5.141592653590   |                  7.593281828459
  subpd:                -1.141592653590   |                  2.156718171541
  mulpd:                 6.283185307180   |                 13.251623913738
  divpd:                 0.636619772368   |                  1.793412275711
  minpd:                 2.000000000000   |                  2.718281828459
  maxpd:                 3.141592653590   |                  4.875000000000
  sqrtpd:                1.414213562373   |                  2.207940216582
  abspd:                 3.141592653590   |                  2.718281828459
Group #1
  a:                     7.500000000000   |                  3.000000000000
  b:                    -9.125000000000   |                 -3.141592653590
  addpd:                -1.625000000000   |                 -0.141592653590
  subpd:                16.625000000000   |                  6.141592653590
  mulpd:               -68.437500000000   |                 -9.424777960769
  divpd:                -0.821917808219   |                 -0.954929658551
  minpd:                -9.125000000000   |                 -3.141592653590
  maxpd:                 7.500000000000   |                  3.000000000000
  sqrtpd:                2.738612787526   |                  1.732050807569
  abspd:                 9.125000000000   |                  3.141592653590
Group #2
  a:                    12.333300000000   |                 24.000000000000
  b:                     1.570796326795   |                  1.359140914230
  addpd:                13.904096326795   |                 25.359140914230
  subpd:                10.762503673205   |                 22.640859085770
  mulpd:                19.373102337259   |                 32.619381941509
  divpd:                 7.851622638541   |                 17.658213176229
  minpd:                 1.570796326795   |                  1.359140914230
  maxpd:                12.333300000000   |                 24.000000000000
  sqrtpd:                3.511879838491   |                  4.898979485566
  abspd:                 1.570796326795   |                  1.359140914230
Group #3
  a:                    37.500000000000   |                 43.000000000000
  b:                    -4.562500000000   |                 -1.570796326795
  addpd:                32.937500000000   |                 41.429203673205
  subpd:                42.062500000000   |                 44.570796326795
  mulpd:              -171.093750000000   |                -67.544242052181
  divpd:                -8.219178082192   |                -27.374650211806
  minpd:                -4.562500000000   |                 -1.570796326795
  maxpd:                37.500000000000   |                 43.000000000000
  sqrtpd:                6.123724356958   |                  6.557438524302
  abspd:                 4.562500000000   |                  1.570796326795

Compare Operations

The next source code example is named Ch08_02. This example demonstrates packed floating-point compare operations using 512-bit wide SIMD operands. It also highlights mask merging operations using floating-point elements. Listing 8-2 shows the source code for example Ch08_02.
//------------------------------------------------
//               Ch08_02.h
//------------------------------------------------
#pragma once
#include <cstdint>
#include "ZmmVal.h"
// Ch08_02_fcpp.cpp
extern void PackedCompareF32_Iavx512(uint16_t c[8], const ZmmVal* a, const ZmmVal* b);
extern void PackedCompareF64_Iavx512(ZmmVal* c, const ZmmVal* a, const ZmmVal* b,
    double x1, double x2, double x3);
//------------------------------------------------
//               Ch08_02.cpp
//------------------------------------------------
#include <iostream>
#include <iomanip>
#include <limits>
#define _USE_MATH_DEFINES
#include <math.h>
#include "Ch08_02.h"
static void PackedCompareF32(void);
static void PackedCompareF64(void);
static const char* c_CmpStr[8] = { "EQ", "NE", "LT", "LE", "GT", "GE", "OD", "UO" };
int main()
{
    // Run the F32 and F64 packed-compare demonstrations.
    PackedCompareF32();
    PackedCompareF64();
    return 0;
}
static void PackedCompareF32(void)
{
    ZmmVal a, b;
    uint16_t c[8];
    const char nl = ' ';
    constexpr float qnan_f32 = std::numeric_limits<float>::quiet_NaN();
    a.m_F32[0] = 2.0f;                  b.m_F32[0] = 1.0f;
    a.m_F32[1] = 7.0f;                  b.m_F32[1] = 12.0f;
    a.m_F32[2] = -6.0f;                 b.m_F32[2] = -6.0f;
    a.m_F32[3] = 3.0f;                  b.m_F32[3] = 8.0f;
    a.m_F32[4] = -16.0f;                b.m_F32[4] = -36.0f;
    a.m_F32[5] = 3.5f;                  b.m_F32[5] = 3.5f;
    a.m_F32[6] = (float)M_PI;           b.m_F32[6] = -6.0f;
    a.m_F32[7] = (float)M_SQRT2;        b.m_F32[7] = qnan_f32;
    a.m_F32[8] = 102.0f;                b.m_F32[8] = (float)M_SQRT1_2;
    a.m_F32[9] = 77.0f;                 b.m_F32[9] = 77.0f;
    a.m_F32[10] = 187.0f;               b.m_F32[10] = 33.0f;
    a.m_F32[11] = -5.1f;                b.m_F32[11] = -87.0f;
    a.m_F32[12] = 16.0f;                b.m_F32[12] = 936.0f;
    a.m_F32[13] = 0.5f;                 b.m_F32[13] = 0.5f;
    a.m_F32[14] = (float)(M_PI * 2);    b.m_F32[14] = 66.6667f;
    a.m_F32[15] = (float)(1.0/M_SQRT2); b.m_F32[15] = 100.7f;
    PackedCompareF32_Iavx512(c, &a, &b);
    const size_t w1 = 10;
    const size_t w2 = 6;
    std::cout << (" Results for PackedCompareF32 ");
    std::cout << std::fixed << std::setprecision(4);
    std::cout << "      a          b    ";
    for (unsigned int j = 0; j < 8; j++)
        std::cout << std::setw(w2) << c_CmpStr[j];
    std::cout << nl << std::string(70, '-') << nl;
    for (unsigned int i = 0; i < 16; i++)
    {
        std::cout << std::setw(w1) << a.m_F32[i];
        std::cout << std::setw(w1) << b.m_F32[i];
        for (unsigned int j = 0; j < 8; j++)
            std::cout << std::setw(w2) << ((c[j] & (1 << i)) ? 1 : 0);
        std::cout << nl;
    }
}
static void PackedCompareF64(void)
{
    ZmmVal a, b, c;
    const double x1 = -20.0, x2 = 20.0, x3 = 999.0;
    const char nl = ' ';
    a.m_F64[0] = 5.0;           b.m_F64[0] = -1.0;
    a.m_F64[1] = 21.0;          b.m_F64[1] =  2.0;
    a.m_F64[2] = 3.0;           b.m_F64[2] = 14.0;
    a.m_F64[3] = 4.0;           b.m_F64[3] = -9.0;
    a.m_F64[4] = 52.0;          b.m_F64[4] = -5.0;
    a.m_F64[5] = 6.0;           b.m_F64[5] = 6.0;
    a.m_F64[6] = 7.0;           b.m_F64[6] = -37.0;
    a.m_F64[7] = 8.0;           b.m_F64[7] = -9.0;
    PackedCompareF64_Iavx512(&c, &a, &b, x1, x2, x3);
    const size_t w1 = 8;
    const size_t w2 = 12;
    std::cout << std::fixed << std::setprecision(4);
    std::cout << (" Results for PackedCompareF64 ");
    std::cout << "(x1 = " << std::setw(w1) << x1;
    std::cout << " x2 = " << std::setw(w1) << x2 << ") ";
    for (unsigned int i = 0; i < 8; i++)
    {
        std::cout << std::setw(w2) << a.m_F64[i];
        std::cout << std::setw(w2) << b.m_F64[i];
        std::cout << std::setw(w2) << c.m_F64[i];
        std::cout << nl;
    }
}
//------------------------------------------------
//               Ch08_02_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_02.h"
// Compares the 16 F32 element pairs of a and b using eight different compare
// predicates. Each result c[j] is a 16-bit mask: bit i is set when the
// predicate holds for element pair i. Operands must be 64-byte aligned.
// (The predicate argument is a compile-time immediate, so each compare is
// written out explicitly rather than looped.)
void PackedCompareF32_Iavx512(uint16_t c[8], const ZmmVal* a, const ZmmVal* b)
{
    __m512 av = _mm512_load_ps(a);
    __m512 bv = _mm512_load_ps(b);
    c[0] = _mm512_cmp_ps_mask(av, bv, _CMP_EQ_OQ);      // equal
    c[1] = _mm512_cmp_ps_mask(av, bv, _CMP_NEQ_OQ);     // not equal
    c[2] = _mm512_cmp_ps_mask(av, bv, _CMP_LT_OQ);      // less than
    c[3] = _mm512_cmp_ps_mask(av, bv, _CMP_LE_OQ);      // less than or equal
    c[4] = _mm512_cmp_ps_mask(av, bv, _CMP_GT_OQ);      // greater than
    c[5] = _mm512_cmp_ps_mask(av, bv, _CMP_GE_OQ);      // greater than or equal
    c[6] = _mm512_cmp_ps_mask(av, bv, _CMP_ORD_Q);      // ordered (neither is NaN)
    c[7] = _mm512_cmp_ps_mask(av, bv, _CMP_UNORD_Q);    // unordered (either is NaN)
}
// Demonstrates merge masking: c[i] = a[i] * b[i] when BOTH a[i] and b[i] lie
// in the closed interval [x1, x2]; otherwise c[i] = x3. The ZmmVal operands
// must be 64-byte aligned.
void PackedCompareF64_Iavx512(ZmmVal* c, const ZmmVal* a, const ZmmVal* b,
    double x1, double x2, double x3)
{
    __m512d av = _mm512_load_pd(a);
    __m512d bv = _mm512_load_pd(b);
    __m512d lo_vals = _mm512_set1_pd(x1);
    __m512d hi_vals = _mm512_set1_pd(x2);
    __m512d fill_vals = _mm512_set1_pd(x3);
    // Build per-operand range masks, then require both to hold.
    __mmask8 a_in_range = _mm512_cmp_pd_mask(av, lo_vals, _CMP_GE_OQ)
                        & _mm512_cmp_pd_mask(av, hi_vals, _CMP_LE_OQ);
    __mmask8 b_in_range = _mm512_cmp_pd_mask(bv, lo_vals, _CMP_GE_OQ)
                        & _mm512_cmp_pd_mask(bv, hi_vals, _CMP_LE_OQ);
    __mmask8 m = a_in_range & b_in_range;
    // Merge masking: multiply where the mask bit is set, take x3 elsewhere.
    _mm512_store_pd(c, _mm512_mask_mul_pd(fill_vals, m, av, bv));
}
Listing 8-2

Example Ch08_02

In Listing 8-2, the files Ch08_02.h and Ch08_02.cpp contain function declarations and test case initialization code for this source code example. The first function in file Ch08_02_fcpp.cpp, PackedCompareF32_Iavx512(), performs SIMD compare operations using packed single-precision floating-point operands. Unlike AVX and AVX2, AVX-512 SIMD floating-point compare operations return scalar integers that signify the results. In the current example, the C++ SIMD intrinsic function _mm512_cmp_ps_mask() returns an integer value of type __mmask16. Each bit position of this 16-bit wide mask value reports the compare result for the corresponding SIMD operand element position (1 = compare predicate true, 0 = compare predicate false). Function _mm512_cmp_ps_mask() uses the same compare predicates that _mm256_cmp_ps() uses (see example Ch03_02).

The next function in file Ch08_02_fcpp.cpp is named PackedCompareF64_Iavx512(). This function demonstrates execution of the C++ SIMD intrinsic function _mm512_cmp_pd_mask(). Note that the mask value calculated by the four _mm512_cmp_pd_mask() calls is used in the ensuing call to _mm512_mask_mul_pd(), which carries out multiplication using only those element pairs whose values are greater than or equal to x1 and less than or equal to x2. Element pairs whose values do not meet these criteria are set to x3. This (somewhat contrived) computation illustrates how to perform merge masking using packed double-precision floating-point operands. Most AVX-512 floating-point C++ SIMD intrinsic functions support both merge masking and zero masking variants. Here are the results for source code example Ch08_02:
Results for PackedCompareF32
      a          b       EQ    NE    LT    LE    GT    GE    OD    UO
----------------------------------------------------------------------
    2.0000    1.0000     0     1     0     0     1     1     1     0
    7.0000   12.0000     0     1     1     1     0     0     1     0
   -6.0000   -6.0000     1     0     0     1     0     1     1     0
    3.0000    8.0000     0     1     1     1     0     0     1     0
  -16.0000  -36.0000     0     1     0     0     1     1     1     0
    3.5000    3.5000     1     0     0     1     0     1     1     0
    3.1416   -6.0000     0     1     0     0     1     1     1     0
    1.4142       nan     0     0     0     0     0     0     0     1
  102.0000    0.7071     0     1     0     0     1     1     1     0
   77.0000   77.0000     1     0     0     1     0     1     1     0
  187.0000   33.0000     0     1     0     0     1     1     1     0
   -5.1000  -87.0000     0     1     0     0     1     1     1     0
   16.0000  936.0000     0     1     1     1     0     0     1     0
    0.5000    0.5000     1     0     0     1     0     1     1     0
    6.2832   66.6667     0     1     1     1     0     0     1     0
    0.7071  100.7000     0     1     1     1     0     0     1     0
Results for PackedCompareF64 (x1 = -20.0000 x2 =  20.0000)
      5.0000     -1.0000     -5.0000
     21.0000      2.0000    999.0000
      3.0000     14.0000     42.0000
      4.0000     -9.0000    -36.0000
     52.0000     -5.0000    999.0000
      6.0000      6.0000     36.0000
      7.0000    -37.0000    999.0000
      8.0000     -9.0000    -72.0000

Floating-Point Arrays

In Chapter 3, you learned how to calculate the mean and standard deviation of an array of single-precision floating-point values using AVX (see example Ch03_04). The next source example, Ch08_03, explains how to perform these same calculations using AVX-512. Listing 8-3 shows only the mean and standard deviation calculating code for example Ch08_03. Not shown is the test case initialization and error checking code since it is almost identical to the code that you saw in example Ch03_04.
//------------------------------------------------
//               Ch08_03_fcpp.cpp
//------------------------------------------------
#include <cmath>
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_03.h"
#include "AlignedMem.h"
// Scalar reference implementation: calculates the arithmetic mean of the
// n-element array x and stores it in *mean. Throws std::runtime_error when
// argument validation (CheckArgs) fails.
void CalcMeanF32_Cpp(float* mean, const float* x, size_t n)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcMeanF32_Cpp() - CheckArgs failed");
    float total = 0.0f;
    for (size_t j = 0; j < n; j++)
        total += x[j];
    *mean = total / n;
}
// Scalar reference implementation: calculates the sample standard deviation
// (divisor n - 1) of the n-element array x about the supplied mean and stores
// it in *st_dev. Throws std::runtime_error when CheckArgs fails.
void CalcStDevF32_Cpp(float* st_dev, const float* x, size_t n, float mean)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcStDevF32_Cpp() - CheckArgs failed");
    float ss = 0.0f;
    for (size_t j = 0; j < n; j++)
    {
        float dev = x[j] - mean;
        ss += dev * dev;
    }
    *st_dev = sqrt(ss / (n - 1));
}
// AVX-512 mean: accumulates 16 partial F32 sums per iteration, reduces them
// to a scalar, and finishes any residual elements with scalar code.
// x must be aligned on a 64-byte boundary (enforced by CheckArgs).
void CalcMeanF32_Iavx512(float* mean, const float* x, size_t n)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcMeanF32_Iavx512() - CheckArgs failed");
    constexpr size_t kSimdWidth = 16;
    __m512 partial_sums = _mm512_setzero_ps();
    size_t i = 0;
    while (n - i >= kSimdWidth)
    {
        partial_sums = _mm512_add_ps(_mm512_load_ps(&x[i]), partial_sums);
        i += kSimdWidth;
    }
    // Horizontal reduction of the 16 partial sums to a single scalar.
    float total = _mm512_reduce_add_ps(partial_sums);
    // Residual elements (n not a multiple of 16) handled with scalar code.
    while (i < n)
        total += x[i++];
    *mean = total / n;
}
void CalcStDevF32_Iavx512(float* st_dev, const float* x, size_t n, float mean)
{
    if (!CheckArgs(x, n))
        throw std::runtime_error("CalcStDevF32_Iavx512() - CheckArgs failed");
    __m512 packed_mean = _mm512_set1_ps(mean);
    __m512 packed_sum_squares = _mm512_setzero_ps();
    size_t i = 0;
    const size_t num_simd_elements = 16;
    for (; n - i >= num_simd_elements; i += num_simd_elements)
    {
        __m512 x_vals = _mm512_load_ps(&x[i]);
        __m512 temp1 = _mm512_sub_ps(x_vals, packed_mean);
        packed_sum_squares = _mm512_fmadd_ps(temp1, temp1, packed_sum_squares);
    }
    // Reduce packed_sum_squares to scalar value, then process remaining elements
    float sum_squares =_mm512_reduce_add_ps(packed_sum_squares);
    for (; i < n; i++)
    {
        float temp1 = x[i] - mean;
        sum_squares += temp1 * temp1;
    }
    *st_dev = sqrt(sum_squares / (n - 1));
}
Listing 8-3

Example Ch08_03

The first two functions in file Ch08_03_fcpp.cpp, CalcMeanF32_Cpp() and CalcStDevF32_Cpp(), calculate the mean and standard deviation using standard C++ statements. These functions are included in this example for comparison purposes. The next function, CalcMeanF32_Iavx512(), calculates the array mean using AVX-512 C++ SIMD intrinsic functions. Following argument validation, function CalcMeanF32_Iavx512() uses _mm512_setzero_ps() to initialize sums to zero. The variable sums contains 16 intermediate single-precision floating-point sum values. These values are updated during each iteration of the ensuing for-loop. Following execution of the for-loop, CalcMeanF32_Iavx512() uses the C++ SIMD intrinsic function _mm512_reduce_add_ps() to reduce the 16 single-precision floating-point values in sums to a single scalar value. Recall that the AVX code in example Ch03_04 employed a sequence of C++ SIMD intrinsic function calls to perform this same reduction. Following the reduction of sums, the second for-loop in CalcMeanF32_Iavx512() processes any residual elements using scalar arithmetic.

The standard deviation calculating code in function CalcStDevF32_Iavx512() uses the C++ SIMD intrinsic function _mm512_set1_ps() to initialize packed_mean. This is followed by a call to _mm512_setzero_ps(), which initializes packed_sum_squares to zero. During each iteration, the first for-loop in CalcStDevF32_Iavx512() updates this value using 16 elements from array x. Following execution of the for-loop, packed_sum_squares is reduced to a scalar value using the C++ SIMD intrinsic function _mm512_reduce_add_ps(). The second for-loop in CalcStDevF32_Iavx512() processes any residual elements using scalar arithmetic. Here are the results for source code example Ch08_03:
Results for CalcMeanF32_Cpp and CalcStDevF32_Cpp
mean1:    49.602146  st_dev1:  27.758242
Results for CalcMeanF32_Iavx512 and CalcStDevF32_Iavx512
mean2:    49.602158  st_dev2:  27.758244

Like example Ch03_04, the results for source code example Ch08_03 contain some slight discrepancy values due to the non-associativity of floating-point arithmetic. Whether these discrepancies are of any consequence depends on the specific application.

Floating-Point Matrices

In Chapter 5, you studied several source code examples that explained how to perform common matrix operations using AVX2 C++ SIMD intrinsic functions. In this section, you will learn how to carry out some of the same matrix operations using AVX-512 C++ SIMD intrinsic functions. The first source code example highlights the use of AVX-512 to calculate a covariance matrix. This is followed by two source code examples that spotlight matrix multiplication. The final source code example of this section explicates matrix-vector multiplication. As you will soon see, it is often a straightforward programming task to adapt an algorithm originally written using AVX2 C++ SIMD intrinsic functions to one that exploits the computational resources of AVX-512.

Covariance Matrix

Mathematicians often use a statistical measure called covariance to quantify the extent to which two random variables vary together. When multiple random variables are being analyzed, it is common to calculate a matrix of all possible covariances. This matrix is called, unsurprisingly, a covariance matrix. Once calculated, a covariance matrix can be employed to perform a wide variety of advanced statistical analyses. Appendix B contains several references that you can consult if you are interested in learning more about covariance and covariance matrices.

The calculation of a covariance matrix begins with a sample data matrix as shown in Figure 8-1. In this figure, each row of matrix X represents one random variable (or feature). Each column in X is a multivariate observation. The elements cij in covariance matrix C are calculated using the following equation:
$$ c_{ij} = \frac{\sum\limits_{k=0}^{n_{obv}-1} \left( x_{ik} - \overline{x}_i \right) \left( x_{jk} - \overline{x}_j \right)}{n_{obv}-1} $$
where i = 0, 1, ⋯, nvar − 1 and j = 0, 1, ⋯, nvar − 1. In these equations, the symbols nobv and nvar signify the number of observations and variables, respectively. A covariance matrix is always a square (nvar × nvar) symmetric (cij = cji) matrix as shown in Figure 8-1. Each covariance matrix element cij represents the covariance between random variables xi and xj, and each main diagonal element cii is the variance for variable xi.
Figure 8-1

Example data matrix and covariance matrix

Listing 8-4 shows the source code for example Ch08_04. This example demonstrates how to calculate a covariance matrix using AVX-512 SIMD arithmetic.
//------------------------------------------------
//               Ch08_04.h
//------------------------------------------------
#pragma once
#include <vector>
#include "MatrixF64.h"
// Note: In this example, CMD stands for covariance matrix data
struct CMD
{
    MatrixF64 m_X;                      // Data matrix: n_vars rows (variables) x n_obvs columns (observations)
    MatrixF64 m_CovMat;                 // Covariance matrix: n_vars x n_vars, symmetric
    std::vector<double> m_VarMeans;     // Mean of each variable, i.e., of each row of m_X
    // Allocates the data matrix, covariance matrix, and means vector for
    // n_vars variables and n_obvs observations per variable.
    CMD(size_t n_vars, size_t n_obvs) :
        m_X(n_vars, n_obvs), m_CovMat(n_vars, n_vars), m_VarMeans(n_vars) { }
};
// Ch08_04_fcpp.cpp
extern void CalcCovMatF64_Cpp(CMD& cmd);
extern void CalcCovMatF64_Iavx512(CMD& cmd);
// Ch08_04_misc.cpp
extern bool CheckArgs(const CMD& cmd);
extern bool CompareResults(CMD& cmd1, CMD& cmd2);
extern void InitCMD(CMD& cmd1, CMD& cmd2);
// Ch08_04_misc2.cpp
extern void DisplayData(const CMD& cmd);
//------------------------------------------------
//               Ch08_04_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch08_04.h"
void CalcCovMatF64_Cpp(CMD& cmd)
{
    if (!CheckArgs(cmd))
        throw std::runtime_error("CalcCovMatrixF64_Cpp() - CheckArgs failed");
    size_t n_vars = cmd.m_X.GetNumRows();
    size_t n_obvs = cmd.m_X.GetNumCols();
    double* cov_mat = cmd.m_CovMat.Data();
    double* x = cmd.m_X.Data();
    double* var_means = cmd.m_VarMeans.data();
    // Calculate variable means (rows of cmd.m_X)
    for (size_t i = 0; i < n_vars; i++)
    {
        var_means[i] = 0.0;
        for (size_t j = 0; j < n_obvs; j++)
            var_means[i] += x[i * n_obvs + j];
        var_means[i] /= n_obvs;
    }
    // Calculate covariance matrix
    for (size_t i = 0; i < n_vars; i++)
    {
        for (size_t j = 0; j < n_vars; j++)
        {
            if (i <= j)
            {
                double sum = 0.0;
                for (size_t k = 0; k < n_obvs; k++)
                {
                    double temp1 = x[i * n_obvs + k] - var_means[i];
                    double temp2 = x[j * n_obvs + k] - var_means[j];
                    sum += temp1 * temp2;
                }
                cov_mat[i * n_vars + j] = sum / (n_obvs - 1);
            }
            else
                cov_mat[i * n_vars + j] = cov_mat[j * n_vars + i];
        }
    }
}
// Horizontally sums the double-precision elements of a (8 elements),
// b (4 elements), and c (2 elements) and returns the scalar total.
// Arguments c and b are zero-extended to wider registers (upper lanes
// cleared, identical to inserting into a zeroed vector) so that a single
// _mm512_reduce_add_pd() can perform the final reduction.
inline double ReduceAddF64(__m512d a, __m256d b, __m128d c)
{
    // Widen c to 256 bits (upper two lanes zero) and fold in b
    __m256d partial256 = _mm256_add_pd(_mm256_zextpd128_pd256(c), b);

    // Widen the 256-bit partial sums to 512 bits and fold in a
    __m512d partial512 = _mm512_add_pd(_mm512_zextpd256_pd512(partial256), a);

    // Reduce the eight partial sums to a scalar
    return _mm512_reduce_add_pd(partial512);
}
void CalcCovMatF64_Iavx512(CMD& cmd)
{
    if (!CheckArgs(cmd))
        throw std::runtime_error("CalcCovMatrixF64_Iavx512() - CheckArgs failed");
    size_t n_vars = cmd.m_X.GetNumRows();
    size_t n_obvs = cmd.m_X.GetNumCols();
    double* cov_mat = cmd.m_CovMat.Data();
    double* x = cmd.m_X.Data();
    double* var_means = cmd.m_VarMeans.data();
    const size_t num_simd_elements8 = 8;
    const size_t num_simd_elements4 = 4;
    const size_t num_simd_elements2 = 2;
    // Calculate variable means (rows of cmd.m_X)
    for (size_t i = 0; i < n_vars; i++)
    {
        size_t j = 0;
         __m512d sums_512 = _mm512_setzero_pd();
         __m256d sums_256 = _mm256_setzero_pd();
         __m128d sums_128 = _mm_setzero_pd();
        while (j < n_obvs)
        {
            if (n_obvs - j >= num_simd_elements8)
            {
                __m512d x_vals = _mm512_loadu_pd(&x[i * n_obvs + j]);
                sums_512 = _mm512_add_pd(x_vals, sums_512);
                j += num_simd_elements8;
            }
            else if (n_obvs - j >= num_simd_elements4)
            {
                __m256d x_vals = _mm256_loadu_pd(&x[i * n_obvs + j]);
                sums_256 = _mm256_add_pd(x_vals, sums_256);
                j += num_simd_elements4;
            }
            else if (n_obvs - j >= num_simd_elements2)
            {
                __m128d x_vals = _mm_loadu_pd(&x[i * n_obvs + j]);
                sums_128 = _mm_add_pd(x_vals, sums_128);
                j += num_simd_elements2;
            }
            else
            {
                __m128d x_val = _mm_load_sd(&x[i * n_obvs + j]);
                sums_128 = _mm_add_pd(x_val, sums_128);
                j += 1;
            }
        }
        double sum = ReduceAddF64(sums_512, sums_256, sums_128);
        var_means[i] = sum / n_obvs;
    }
    // Calculate covariance matrix
    for (size_t i = 0; i < n_vars; i++)
    {
        for (size_t j = 0; j < n_vars; j++)
        {
            if (i <= j)
            {
                size_t k = 0;
                __m512d sums_512 = _mm512_setzero_pd();
                __m256d sums_256 = _mm256_setzero_pd();
                __m128d sums_128 = _mm_setzero_pd();
                while (k < n_obvs)
                {
                    if (n_obvs - k >= num_simd_elements8)
                    {
                        __m512d var_means1_512 = _mm512_set1_pd(var_means[i]);
                        __m512d var_means2_512 = _mm512_set1_pd(var_means[j]);
                        __m512d x_vals1 = _mm512_loadu_pd(&x[i * n_obvs + k]);
                        __m512d x_vals2 = _mm512_loadu_pd(&x[j * n_obvs + k]);
                        __m512d temp1 = _mm512_sub_pd(x_vals1, var_means1_512);
                        __m512d temp2 = _mm512_sub_pd(x_vals2, var_means2_512);
                        sums_512 = _mm512_fmadd_pd(temp1, temp2, sums_512);
                        k += num_simd_elements8;
                    }
                    else if (n_obvs - k >= num_simd_elements4)
                    {
                        __m256d var_means1_256 = _mm256_set1_pd(var_means[i]);
                        __m256d var_means2_256 = _mm256_set1_pd(var_means[j]);
                        __m256d x_vals1 = _mm256_loadu_pd(&x[i * n_obvs + k]);
                        __m256d x_vals2 = _mm256_loadu_pd(&x[j * n_obvs + k]);
                        __m256d temp1 = _mm256_sub_pd(x_vals1, var_means1_256);
                        __m256d temp2 = _mm256_sub_pd(x_vals2, var_means2_256);
                        sums_256 = _mm256_fmadd_pd(temp1, temp2, sums_256);
                        k += num_simd_elements4;
                    }
                    else if (n_obvs - k >= num_simd_elements2)
                    {
                        __m128d var_means1_128 = _mm_set1_pd(var_means[i]);
                        __m128d var_means2_128 = _mm_set1_pd(var_means[j]);
                        __m128d x_vals1 = _mm_loadu_pd(&x[i * n_obvs + k]);
                        __m128d x_vals2 = _mm_loadu_pd(&x[j * n_obvs + k]);
                        __m128d temp1 = _mm_sub_pd(x_vals1, var_means1_128);
                        __m128d temp2 = _mm_sub_pd(x_vals2, var_means2_128);
                        sums_128 = _mm_fmadd_pd(temp1, temp2, sums_128);
                        k += num_simd_elements2;
                    }
                    else
                    {
                        __m128d var_means1_64 = _mm_load_sd(&var_means[i]);
                        __m128d var_means2_64 = _mm_load_sd(&var_means[j]);
                        __m128d x_vals1 = _mm_load_sd(&x[i * n_obvs + k]);
                        __m128d x_vals2 = _mm_load_sd(&x[j * n_obvs + k]);
                        __m128d temp1 = _mm_sub_pd(x_vals1, var_means1_64);
                        __m128d temp2 = _mm_sub_pd(x_vals2, var_means2_64);
                        sums_128 = _mm_fmadd_pd(temp1, temp2, sums_128);
                        k += 1;
                    }
                }
                double sum = ReduceAddF64(sums_512, sums_256, sums_128);
                cov_mat[i * n_vars + j] = sum / (n_obvs - 1);
            }
            else
                cov_mat[i * n_vars + j] = cov_mat[j * n_vars + i];
        }
    }
}
Listing 8-4

Example Ch08_04

Near the top of Listing 8-4 is the file Ch08_04.h, which begins with the definition of structure CMD (CMD = covariance matrix data). This structure contains the data matrix, the variable means vector, and the covariance matrix. Note that CMD also includes a simple constructor that allocates space for the three container objects using the specified n_vars and n_obvs. The source code that performs argument validation, test data initialization, and result comparisons is not shown in Listing 8-4 but included in the download software package.

The core calculating functions of this source code example are in Ch08_04_fcpp.cpp, which begins with the definition of function CalcCovMatF64_Cpp(). This function uses standard C++ statements to calculate the covariance matrix and is included for comparison purposes. The code in CalcCovMatF64_Cpp() is split into two major sections. The first section calculates the mean for each variable (or row) in data matrix x. The second section calculates the covariances. Note that function CalcCovMatF64_Cpp() exploits the fact that a covariance matrix is symmetric and only carries out a complete calculation when i <= j is true. If i <= j is false, CalcCovMatF64_Cpp() executes cov_mat[i][j] = cov_mat[j][i].

The next function in Ch08_04_fcpp.cpp is a SIMD inline function named ReduceAddF64(). This function reduces the double-precision floating-point elements of arguments a (__m512d), b (__m256d), and c (__m128d) to a scalar double-precision value. Note that ReduceAddF64() employs several C++ SIMD intrinsic functions to size-extend argument values b and c to packed 512-bit wide SIMD values. Doing this facilitates the use of the AVX-512 C++ SIMD intrinsic function _mm512_reduce_add_pd() to perform the reduction.

The final function in Listing 8-4 is named CalcCovMatF64_Iavx512(). Like its standard C++ counterpart, function CalcCovMatF64_Iavx512() uses distinct sections of code to calculate the variable means and the covariance matrix. The mean calculating while-loop employs __m512d, __m256d, __m128d, or scalar objects to perform its computations. Note that each if section verifies that enough elements are available in the current row before carrying out any SIMD calculations. Following the while-loop, CalcCovMatF64_Iavx512() invokes ReduceAddF64() to reduce sums_512, sums_256, and sums_128 to a scalar value. It then calculates var_means[i].

Function CalcCovMatF64_Iavx512() uses a similar while-loop construct to calculate the elements of the covariance matrix. Like the function CalcCovMatF64_Cpp(), function CalcCovMatF64_Iavx512() calculates a covariance matrix element only if i <= j is true; otherwise, a covariance matrix element copy is performed. Note that the four covariance-matrix-element calculating code blocks in CalcCovMatF64_Iavx512() employ FMA arithmetic to carry out their computations. Here are the results for source code example Ch08_04:
Results for CalcCovMatF64
n_vars = 12, n_obvs = 111
Variable means
   0:     13.37     13.37
   1:     12.34     12.34
   2:     12.62     12.62
   3:     13.25     13.25
   4:     11.89     11.89
   5:     12.10     12.10
   6:     11.65     11.65
   7:     12.13     12.13
   8:     12.49     12.49
   9:     12.53     12.53
  10:     12.08     12.08
  11:     12.33     12.33
cmd1.m_CovMat
 50.61   2.00   0.60   1.41 -1.57   3.15   -2.68   4.36  -0.70   3.04   5.16  -4.16
  2.00  48.45  -6.96   2.20  5.23   1.45    0.62  -4.88  -0.52   9.59   1.86  -1.76
  0.60  -6.96  48.26 -13.01  1.15  -2.10    5.47  -5.81   3.22   0.46  -0.75   7.50
  1.41   2.20 -13.01  51.71  2.05   1.76   -5.01   7.45  -2.03   4.34  -5.41   7.26
 -1.57   5.23   1.15   2.05 55.88  -6.90   -7.11   0.17   4.78   4.57  -0.03  -3.01
  3.15   1.45  -2.10   1.76 -6.90  54.28   10.10   4.07   0.17  -0.99   1.24   3.15
 -2.68   0.62   5.47  -5.01 -7.11  10.10   56.98  -4.95  -2.19  -2.47 -10.59   1.63
  4.36  -4.88  -5.81   7.45  0.17   4.07   -4.95  52.51   3.89   1.76   4.04   1.80
 -0.70  -0.52   3.22  -2.03  4.78   0.17   -2.19   3.89  52.56   3.16  -2.82   5.82
  3.04   9.59   0.46   4.34  4.57  -0.99   -2.47   1.76   3.16  42.41   8.35   1.28
  5.16   1.86  -0.75  -5.41 -0.03   1.24  -10.59   4.04  -2.82   8.35  58.25 -11.63
 -4.16  -1.76   7.50   7.26 -3.01   3.15    1.63   1.80   5.82   1.28 -11.63  49.79
cmd2.m_CovMat
 50.61   2.00   0.60   1.41  -1.57   3.15   -2.68   4.36  -0.70   3.04   5.16  -4.16
  2.00  48.45  -6.96   2.20   5.23   1.45    0.62  -4.88  -0.52   9.59   1.86  -1.76
  0.60  -6.96  48.26 -13.01   1.15  -2.10    5.47  -5.81   3.22   0.46  -0.75   7.50
  1.41   2.20 -13.01  51.71   2.05   1.76   -5.01   7.45  -2.03   4.34  -5.41   7.26
 -1.57   5.23   1.15   2.05  55.88  -6.90   -7.11   0.17   4.78   4.57  -0.03  -3.01
  3.15   1.45  -2.10   1.76  -6.90  54.28   10.10   4.07   0.17  -0.99   1.24   3.15
 -2.68   0.62   5.47  -5.01  -7.11  10.10   56.98  -4.95  -2.19  -2.47 -10.59   1.63
  4.36  -4.88  -5.81   7.45   0.17   4.07   -4.95  52.51   3.89   1.76   4.04   1.80
 -0.70  -0.52   3.22  -2.03   4.78   0.17   -2.19   3.89  52.56   3.16  -2.82   5.82
  3.04   9.59   0.46   4.34   4.57  -0.99   -2.47   1.76   3.16  42.41   8.35   1.28
  5.16   1.86  -0.75  -5.41  -0.03   1.24  -10.59   4.04  -2.82   8.35  58.25 -11.63
 -4.16  -1.76   7.50   7.26  -3.01   3.15    1.63   1.80   5.82   1.28 -11.63  49.79
CompareResults - passed

Matrix Multiplication

In Chapter 5, you learned how to perform single-precision floating-point matrix multiplication using C++ SIMD intrinsic functions and AVX2 (see example Ch05_02). The next source code example, named Ch08_05, illustrates single-precision floating-point matrix multiplication using C++ SIMD intrinsic functions and AVX-512. Listing 8-5 shows the matrix multiplication calculating code for source code example Ch08_05. Before examining this source code, you may want to review the matrix multiplication equations that were discussed in Chapter 5.
//------------------------------------------------
//               Ch08_05_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_05.h"
// Reference (non-SIMD) single-precision matrix multiply: c = a * b.
// Delegates to the MatrixF32 class's standard C++ implementation; included
// for result comparison and benchmarking against the AVX-512 version.
void MatrixMulF32_Cpp(MatrixF32& c, const MatrixF32& a, const MatrixF32& b)
{
    MatrixF32::Mul(c, a, b);
}
void MatrixMulF32_Iavx512(MatrixF32& c, const MatrixF32& a, const MatrixF32& b)
{
    if (!CheckArgs(c, a, b))
        throw std::runtime_error("MatrixMulF32_Iavx512() CheckArgs failed");
    const float* aa = a.Data();
    const float* bb = b.Data();
    float* cc = c.Data();
    size_t c_nrows = c.GetNumRows();
    size_t c_ncols = c.GetNumCols();
    size_t a_ncols = a.GetNumCols();
    size_t b_ncols = b.GetNumCols();
    size_t m = a_ncols;
    const size_t num_simd_elements = 16;
    size_t num_residual_cols = c_ncols % num_simd_elements;
    // res_mask = 2 ** num_residual_cols - 1
    __mmask16 res_mask = (__mmask16)((1 << num_residual_cols) - 1);
    for (size_t i = 0; i < c_nrows; i++)
    {
        size_t j = 0;
        while (j + num_simd_elements <= c_ncols)
        {
            __m512 c_vals = _mm512_setzero_ps();
            for (size_t k = 0; k < m; k++)
            {
                __m512 a_vals = _mm512_set1_ps(aa[i * a_ncols + k]);
                __m512 b_vals = _mm512_loadu_ps(&bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_ps(a_vals, b_vals, c_vals);
            }
            _mm512_storeu_ps(&cc[i * c_ncols + j], c_vals);
            j += num_simd_elements;
        }
        if (num_residual_cols != 0)
        {
            __m512 c_vals = _mm512_setzero_ps();
            for (size_t k = 0; k < m; k++)
            {
                __m512 a_vals = _mm512_set1_ps(aa[i * a_ncols + k]);
                __m512 b_vals = _mm512_maskz_loadu_ps(res_mask, &bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_ps(a_vals, b_vals, c_vals);
            }
            _mm512_mask_storeu_ps(&cc[i * c_ncols + j], res_mask, c_vals);
        }
    }
}
Listing 8-5

Example Ch08_05

Near the top of Listing 8-5 is the source code for function MatrixMulF32_Iavx512(), which performs single-precision floating-point matrix multiplication. The primary difference between this function and the function MatrixMulF32_Iavx2() that you studied in example Ch05_02 is in the code that calculates the residual column mask for the current row. In example Ch05_02, function MatrixMulF32_Iavx2() used a SIMD integer (__m256i) mask. In this example, function MatrixMulF32_Iavx512() uses a scalar integer (__mmask16) mask since these are directly supported by AVX-512.

Following its initialization tasks, function MatrixMulF32_Iavx512() calculates num_residual_cols. This value is then used to compute the mask needed to process any residual columns in the current row. The layout of MatrixMulF32_Iavx512() is akin to the Ch05_02’s layout of MatrixMulF32_Iavx2(). For nonresidual columns, function MatrixMulF32_Iavx512() uses the C++ SIMD intrinsic functions _mm512_set1_ps(), _mm512_loadu_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps() to calculate products c[i][j:j+15]. Calculation of residual column products is similar except that this code block uses the C++ SIMD intrinsic functions _mm512_maskz_loadu_ps() and _mm512_mask_storeu_ps() to perform masked load and store operations. Here are the results for source code example Ch08_05:
Results for MatrixMulF32
Matrix compare passed
Results saved to file Ch08_05_MatrixMulF32_LITHIUM.txt
Running benchmark function MatrixMulF32_bm - please wait
Benchmark times save to file Ch08_05_MatrixMulF32_bm_LITHIUM.csv
Table 8-1 shows some benchmark timing measurements for source code example Ch08_05. These measurements were made using 250 × 250 matrices.
Table 8-1

Matrix Multiplication (Single-Precision) Execution Times (Microseconds)

CPU

MatrixMulF32_Cpp()

MatrixMulF32_Iavx512()

Intel Core i5-11600K

11432

713

Listing 8-6 shows the calculating code for source code example Ch08_06, which performs double-precision floating-point matrix multiplication.
//------------------------------------------------
//               Ch08_06_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_06.h"
// Reference (non-SIMD) double-precision matrix multiply: c = a * b.
// Delegates to the MatrixF64 class's standard C++ implementation; included
// for result comparison and benchmarking against the AVX-512 version.
void MatrixMulF64_Cpp(MatrixF64& c, const MatrixF64& a, const MatrixF64& b)
{
    MatrixF64::Mul(c, a, b);
}
void MatrixMulF64_Iavx512(MatrixF64& c, const MatrixF64& a, const MatrixF64& b)
{
    if (!CheckArgs(c, a, b))
        throw std::runtime_error("MatrixMulF64_Iavx512() CheckArgs failed");
    const double* aa = a.Data();
    const double* bb = b.Data();
    double* cc = c.Data();
    size_t m = a.GetNumCols();
    size_t c_nrows = c.GetNumRows();
    size_t c_ncols = c.GetNumCols();
    size_t a_ncols = a.GetNumCols();
    size_t b_ncols = b.GetNumCols();
    const size_t num_simd_elements = 8;
    size_t num_residual_cols = c_ncols % num_simd_elements;
    // res_mask = 2 ** num_residual_cols - 1
    __mmask8 res_mask = (__mmask8)((1 << num_residual_cols) - 1);
    for (size_t i = 0; i < c_nrows; i++)
    {
        size_t j = 0;
        while (j + num_simd_elements <= c_ncols)
        {
            __m512d c_vals = _mm512_setzero_pd();
            for (size_t k = 0; k < m; k++)
            {
                __m512d a_vals = _mm512_set1_pd(aa[i * a_ncols + k]);
                __m512d b_vals = _mm512_loadu_pd(&bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_pd(a_vals, b_vals, c_vals);
            }
            _mm512_storeu_pd(&cc[i * c_ncols + j], c_vals);
            j += num_simd_elements;
        }
        if (num_residual_cols != 0)
        {
            __m512d c_vals = _mm512_setzero_pd();
            for (size_t k = 0; k < m; k++)
            {
                __m512d a_vals = _mm512_set1_pd(aa[i * a_ncols + k]);
                __m512d b_vals = _mm512_maskz_loadu_pd(res_mask, &bb[k * b_ncols + j]);
                c_vals = _mm512_fmadd_pd(a_vals, b_vals, c_vals);
            }
            _mm512_mask_storeu_pd(&cc[i * c_ncols + j], res_mask, c_vals);
        }
    }
}
Listing 8-6

Example Ch08_06

The primary difference between functions MatrixMulF32_Iavx512() and MatrixMulF64_Iavx512() is that the latter uses the __m512d data type and _pd variants of the C++ SIMD calculating functions. It also uses a mask of type __mmask8 instead of __mmask16. The results for Ch08_06 are the same as Ch08_05. Table 8-2 shows the benchmark timing measurements for source code example Ch08_06. These measurements were made using 250 × 250 matrices.
Table 8-2

Matrix Multiplication (Double-Precision) Execution Times (Microseconds)

CPU

MatrixMulF64_Cpp()

MatrixMulF64_Iavx512()

Intel Core i5-11600K

11972

1518

Matrix (4 x 4) Vector Multiplication

Listing 8-7 shows the calculating code for source code example Ch08_07. This example, which is an AVX-512 implementation of source code example Ch05_06, illustrates matrix-vector (4 × 4, 4 × 1) multiplication using single-precision floating-point values. Before examining the source code in Listing 8-7, you may want to review the matrix-vector multiplication equations shown in Figure 5-5.
//------------------------------------------------
//               Ch08_07_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <cstdint>
#include <immintrin.h>
#include "Ch08_07.h"
#include "MatrixF32.h"
#include "AlignedMem.h"
// Permutation indices for matrix columns. The 4x4 matrix is stored row-major
// in a ZMM register, so element indices {n, n+4, n+8, n+12} select column n.
// Each table repeats its column's indices four times, producing four copies
// of that column when used with _mm512_permutexvar_ps() (see Figure 8-2).
alignas(64) const uint32_t c_MatIndCol0[] { 0, 4, 8, 12, 0, 4, 8, 12,
                                            0, 4, 8, 12, 0, 4, 8, 12 };
alignas(64) const uint32_t c_MatIndCol1[] { 1, 5, 9, 13, 1, 5, 9, 13,
                                            1, 5, 9, 13, 1, 5, 9, 13 };
alignas(64) const uint32_t c_MatIndCol2[] { 2, 6, 10, 14, 2, 6, 10, 14,
                                            2, 6, 10, 14, 2, 6, 10, 14 };
alignas(64) const uint32_t c_MatIndCol3[] { 3, 7, 11, 15, 3, 7, 11, 15,
                                            3, 7, 11, 15, 3, 7, 11, 15 };
// Permutation indices for vector components. A ZMM register holds four
// packed 4x1 vectors (W at offsets 0/4/8/12, X at 1/5/9/13, etc.). Each
// table broadcasts one component of each vector across its own 128-bit
// lane (see Figure 8-3).
alignas(64) const uint32_t c_VecIndW[] { 0, 0, 0, 0, 4, 4, 4, 4,
                                         8, 8, 8, 8, 12, 12, 12, 12 };
alignas(64) const uint32_t c_VecIndX[] { 1, 1, 1, 1, 5, 5, 5, 5,
                                         9, 9, 9, 9, 13, 13, 13, 13 };
alignas(64) const uint32_t c_VecIndY[] { 2, 2, 2, 2, 6, 6, 6, 6,
                                         10, 10, 10, 10, 14, 14, 14, 14 };
alignas(64) const uint32_t c_VecIndZ[] { 3, 3, 3, 3, 7, 7, 7, 7,
                                         11, 11, 11, 11, 15, 15, 15, 15 };
void MatVecMulF32_Cpp(Vec4x1_F32* vec_b, MatrixF32& m, Vec4x1_F32* vec_a,
    size_t num_vec)
{
    if (!CheckArgs(vec_b, m, vec_a, num_vec))
        throw std::runtime_error("MatVecMulF32_Cpp() - CheckArgs failed");
    const float* mm = m.Data();
    // Calculate matrix-vector products
    for (size_t i = 0; i < num_vec; i++)
    {
        vec_b[i].W =  mm[0] * vec_a[i].W + mm[1] * vec_a[i].X;
        vec_b[i].W += mm[2] * vec_a[i].Y + mm[3] * vec_a[i].Z;
        vec_b[i].X =  mm[4] * vec_a[i].W + mm[5] * vec_a[i].X;
        vec_b[i].X += mm[6] * vec_a[i].Y + mm[7] * vec_a[i].Z;
        vec_b[i].Y =  mm[8] * vec_a[i].W + mm[9] * vec_a[i].X;
        vec_b[i].Y += mm[10] * vec_a[i].Y + mm[11] * vec_a[i].Z;
        vec_b[i].Z =  mm[12] * vec_a[i].W + mm[13] * vec_a[i].X;
        vec_b[i].Z += mm[14] * vec_a[i].Y + mm[15] * vec_a[i].Z;
    }
}
// AVX-512 matrix-vector (4x4, 4x1) multiplication. The main loop computes
// four matrix-vector products per iteration using permuted column/component
// copies; a 128-bit loop handles any residual vectors.
// Note: the _mm512_load_ps / _mm_store_ps calls are aligned accesses, so
// vec_a and vec_b are assumed suitably aligned — presumably validated by
// CheckArgs(); confirm against the caller.
void MatVecMulF32a_Iavx512(Vec4x1_F32* vec_b, MatrixF32& m, Vec4x1_F32* vec_a, size_t num_vec)
{
    if (!CheckArgs(vec_b, m, vec_a, num_vec))
        throw std::runtime_error("MatVecMulF32a_Iavx512() - CheckArgs failed");
    // Load indices for matrix and vector permutations
    __m512i m_ind_col0 = _mm512_load_epi32(c_MatIndCol0);
    __m512i m_ind_col1 = _mm512_load_epi32(c_MatIndCol1);
    __m512i m_ind_col2 = _mm512_load_epi32(c_MatIndCol2);
    __m512i m_ind_col3 = _mm512_load_epi32(c_MatIndCol3);
    __m512i v_ind_w = _mm512_load_epi32(c_VecIndW);
    __m512i v_ind_x = _mm512_load_epi32(c_VecIndX);
    __m512i v_ind_y = _mm512_load_epi32(c_VecIndY);
    __m512i v_ind_z = _mm512_load_epi32(c_VecIndZ);
    // Load source matrix and permute 4 copies of each column (Figure 8-2);
    // each m_colN_vals register holds the same column replicated four times
    __m512 m_vals = _mm512_load_ps(m.Data());
    __m512 m_col0_vals = _mm512_permutexvar_ps(m_ind_col0, m_vals);
    __m512 m_col1_vals = _mm512_permutexvar_ps(m_ind_col1, m_vals);
    __m512 m_col2_vals = _mm512_permutexvar_ps(m_ind_col2, m_vals);
    __m512 m_col3_vals = _mm512_permutexvar_ps(m_ind_col3, m_vals);
    // Calculate matrix-vector products, four vectors per iteration
    size_t i = 0;
    const size_t num_vec_per_iteration = 4;
    for (; num_vec - i >= num_vec_per_iteration; i += num_vec_per_iteration)
    {
        // Load next block of 4 vectors
        __m512 va_vals = _mm512_load_ps(&vec_a[i]);
        // Permute vectors into W, X, Y, Z components (Figure 8-3)
        __m512 va_w_vals = _mm512_permutexvar_ps(v_ind_w, va_vals);
        __m512 va_x_vals = _mm512_permutexvar_ps(v_ind_x, va_vals);
        __m512 va_y_vals = _mm512_permutexvar_ps(v_ind_y, va_vals);
        __m512 va_z_vals = _mm512_permutexvar_ps(v_ind_z, va_vals);
        // Calculate and save matrix-vector products:
        // b = col0*W + col1*X + col2*Y + col3*Z, accumulated with FMA
        __m512 vb_vals = _mm512_mul_ps(m_col0_vals, va_w_vals);
        vb_vals = _mm512_fmadd_ps(m_col1_vals, va_x_vals, vb_vals);
        vb_vals = _mm512_fmadd_ps(m_col2_vals, va_y_vals, vb_vals);
        vb_vals = _mm512_fmadd_ps(m_col3_vals, va_z_vals, vb_vals);
        _mm512_store_ps(&vec_b[i], vb_vals);
    }
    // Residual vectors (num_vec not a multiple of 4), one at a time
    if (i < num_vec)
    {
        // Each source register holds four identical copies of its column
        // (see the c_MatIndColN tables), so any 128-bit lane may be
        // extracted; lanes 0-3 are used here.
        __m128 m_col0_vals2 = _mm512_extractf32x4_ps(m_col0_vals, 0);
        __m128 m_col1_vals2 = _mm512_extractf32x4_ps(m_col1_vals, 1);
        __m128 m_col2_vals2 = _mm512_extractf32x4_ps(m_col2_vals, 2);
        __m128 m_col3_vals2 = _mm512_extractf32x4_ps(m_col3_vals, 3);
        for (; i < num_vec; i++)
        {
            // Broadcast each component of the current vector
            __m128 va_w_vals = _mm_broadcast_ss(&vec_a[i].W);
            __m128 va_x_vals = _mm_broadcast_ss(&vec_a[i].X);
            __m128 va_y_vals = _mm_broadcast_ss(&vec_a[i].Y);
            __m128 va_z_vals = _mm_broadcast_ss(&vec_a[i].Z);
            // Single 4x1 product using the 128-bit column copies
            __m128 vb_vals = _mm_mul_ps(m_col0_vals2, va_w_vals);
            vb_vals = _mm_fmadd_ps(m_col1_vals2, va_x_vals, vb_vals);
            vb_vals = _mm_fmadd_ps(m_col2_vals2, va_y_vals, vb_vals);
            vb_vals = _mm_fmadd_ps(m_col3_vals2, va_z_vals, vb_vals);
            _mm_store_ps((float*)&vec_b[i], vb_vals);
        }
    }
}
Listing 8-7

Example Ch08_07

The source code in file Ch08_07_fcpp.cpp begins with a series of arrays that contain permutation indices. The AVX-512 implementation of the matrix-vector multiplication algorithm uses these indices to reorder the elements of the source matrix and vectors. The reason for this reordering is to facilitate the calculation of four matrix-vector products during each iteration of the for-loop. The definition of function MatVecMulF32_Cpp() follows the permutation indices. This function calculates matrix-vector (4 × 4, 4 × 1) products using standard C++ statements.

Following argument validation, function MatVecMulF32a_Iavx512() loads the permutation indices using a series of _mm512_load_epi32() calls. The ensuing call to _mm512_load_ps() loads matrix m into m_vals. This is followed by a series of four calls to _mm512_permutexvar_ps() that permute the elements in m_vals to generate four copies of each column in matrix m as shown in Figure 8-2.
Figure 8-2

Permutation of matrix columns using _mm512_permutexvar_ps()

Each iteration of the first for-loop in MatVecMulF32a_Iavx512() begins with a call to _mm512_load_ps() that loads a block of four vectors into va_vals. The next code block employs the C++ SIMD intrinsic function _mm512_permutexvar_ps() to reorder vector components W, X, Y, and Z. Figure 8-3 illustrates this operation in greater detail. Following the permutation, MatVecMulF32a_Iavx512() invokes _mm512_mul_ps() and _mm512_fmadd_ps() to calculate four matrix-vector products. The final call of the for-loop, _mm512_store_ps(), saves the just calculated matrix-vector products. The second for-loop in MatVecMulF32a_Iavx512() calculates any residual matrix-vector products if num_vec is not an integral multiple of num_vec_per_iteration.
Figure 8-3

Permutation of vector components using _mm512_permutexvar_ps()

Not shown in Listing 8-7 is the function MatVecMulF32b_Iavx512(). This function differs slightly from MatVecMulF32a_Iavx512() in that it uses the C++ SIMD intrinsic function _mm512_stream_ps() to save the calculated matrix-vector products. Unlike function _mm512_store_ps(), function _mm512_stream_ps() saves the specified data SIMD value using a nontemporal memory hint. A nontemporal memory hint notifies the processor that the data being saved will not be immediately referenced again. This allows the processor to (optionally) bypass its normal memory cache hierarchy, which minimizes cache pollution and often results in better performance as you will soon see. It is, however, important to note that improper use of _mm512_stream_ps() can result in slower performance. Appendix B contains a list of AMD and Intel programming reference manuals that you can consult for more information about nontemporal memory hints. Here are the results for source code example Ch08_07:
Results for MatrixVecMulF32
Test case #0
vec_b1:      304.0      564.0      824.0     1084.0
vec_b2:      304.0      564.0      824.0     1084.0
vec_b3:      304.0      564.0      824.0     1084.0
Test case #1
vec_b1:      764.0     1424.0     2084.0     2744.0
vec_b2:      764.0     1424.0     2084.0     2744.0
vec_b3:      764.0     1424.0     2084.0     2744.0
Test case #2
vec_b1:     1224.0     2284.0     3344.0     4404.0
vec_b2:     1224.0     2284.0     3344.0     4404.0
vec_b3:     1224.0     2284.0     3344.0     4404.0
Test case #3
vec_b1:     1684.0     3144.0     4604.0     6064.0
vec_b2:     1684.0     3144.0     4604.0     6064.0
vec_b3:     1684.0     3144.0     4604.0     6064.0
Test case #4
vec_b1:    11932.0    22452.0    32972.0    43492.0
vec_b2:    11932.0    22452.0    32972.0    43492.0
vec_b3:    11932.0    22452.0    32972.0    43492.0
Test case #5
vec_b1:    17125.0    31705.0    46285.0    60865.0
vec_b2:    17125.0    31705.0    46285.0    60865.0
vec_b3:    17125.0    31705.0    46285.0    60865.0
Test case #6
vec_b1:    12723.0    23873.0    35023.0    46173.0
vec_b2:    12723.0    23873.0    35023.0    46173.0
vec_b3:    12723.0    23873.0    35023.0    46173.0
Test case #7
vec_b1:    15121.0    27871.0    40621.0    53371.0
vec_b2:    15121.0    27871.0    40621.0    53371.0
vec_b3:    15121.0    27871.0    40621.0    53371.0
Test case #8
vec_b1:    13789.0    26039.0    38289.0    50539.0
vec_b2:    13789.0    26039.0    38289.0    50539.0
vec_b3:    13789.0    26039.0    38289.0    50539.0
Test case #9
vec_b1:     9663.0    17873.0    26083.0    34293.0
vec_b2:     9663.0    17873.0    26083.0    34293.0
vec_b3:     9663.0    17873.0    26083.0    34293.0
Running benchmark function MatrixVecMulF32_bm - please wait
Benchmark times save to file Ch08_07_MatrixVecMulF32_bm_LITHIUM.csv
Table 8-3 shows the benchmark timing measurements for source code example Ch08_07. Note that use of the C++ intrinsic function _mm512_stream_ps() in MatVecMulF32b_Iavx512() yielded an appreciable improvement in performance.
Table 8-3

Matrix-Vector (4 × 4, 4 × 1) Multiplication Execution Times (Microseconds), 1,000,000 Vectors

CPU

MatVecMulF32_Cpp()

MatVecMulF32a_Iavx512()

MatVecMulF32b_Iavx512()

Intel Core i5-11600K

5069

1111

708

Convolutions

In Chapter 6, you learned how to compute 1D and 2D discrete convolutions using C++ intrinsic functions and AVX2. In this section, you will examine two source code examples that illustrate convolutions using AVX-512. Like Chapter 6, the source code examples discussed in this section are somewhat more specialized than those covered in the previous sections. If your SIMD programming interests reside elsewhere, you can either skim this section or skip ahead to the next chapter. If you decide to continue, you may want to review the sections in Chapter 6 that explained the mathematics of a discrete convolution before examining the source code.

1D Convolutions

The next source code example, Ch08_08, implements a 1D discrete convolution using C++ SIMD intrinsic functions and AVX-512. Listing 8-8 shows the calculating code for this source code example, which is a modified version of source code example Ch06_01.
//------------------------------------------------
//               Ch08_08_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_08.h"
#include "MiscTypes.h"
void Convolve1D_F32_Cpp(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1D_F32_Cpp() - CheckArgs failed");
    indx_t num_pts = (indx_t)y.size();
    indx_t ks2 = kernel.size() / 2;
    for (indx_t i = ks2; i < num_pts - ks2; i++)
    {
        float y_val = 0;
        for (indx_t k = -ks2; k <= ks2; k++)
            y_val += x[i - k] * kernel[k + ks2];
        y[i] = y_val;
    }
}
//-----------------------------------------------------------------------------
// Convolve1D_F32_Iavx512 - AVX-512 implementation of a 1D discrete
// convolution (single-precision, variable kernel width).
//
// y      - output signal; elements y[ks2 .. num_pts-ks2-1] receive the
//          convolution result, border elements are left untouched
// x      - input signal (validated against y/kernel by CheckArgs)
// kernel - convolution kernel; size is assumed odd so that ks2 = size / 2
//
// Throws std::runtime_error when CheckArgs() rejects the arguments.
//
// Each while-loop iteration produces as many output points as fit in the
// widest usable SIMD register: 16 (__m512), then 8 (__m256), then 4 (__m128),
// then a single scalar element for the residual tail. The inner loops all
// compute the same sum-of-products as the scalar version, vectorized across
// adjacent output positions.
//-----------------------------------------------------------------------------
void Convolve1D_F32_Iavx512(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1D_F32_Iavx512() - CheckArgs failed");
    indx_t ks2 = (indx_t)kernel.size() / 2;
    indx_t num_pts = (indx_t)y.size();

    // Output points produced per iteration by each register width.
    const indx_t num_simd_elements = 16;  // __m512  (16 x F32)
    const indx_t num_simd_elements2 = 8;  // __m256  ( 8 x F32)
    const indx_t num_simd_elements3 = 4;  // __m128  ( 4 x F32)

    indx_t i = ks2;
    while (i < num_pts - ks2)
    {
        if ((i + num_simd_elements) <= num_pts - ks2)
        {
            // 16 output points at once: broadcast each kernel coefficient and
            // accumulate with fused multiply-add.
             __m512 y_vals = _mm512_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m512 x_vals = _mm512_loadu_ps(&x[i - k]);
                __m512 kernel_vals = _mm512_set1_ps(kernel[k + ks2]);
                y_vals = _mm512_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm512_storeu_ps(&y[i], y_vals);
            i += num_simd_elements;
        }
        else if ((i + num_simd_elements2) <= num_pts - ks2)
        {
            // 8 output points at once (AVX2-width fallback).
             __m256 y_vals = _mm256_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m256 x_vals = _mm256_loadu_ps(&x[i - k]);
                __m256 kernel_vals = _mm256_set1_ps(kernel[k + ks2]);
                y_vals = _mm256_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm256_storeu_ps(&y[i], y_vals);
            i += num_simd_elements2;
        }
        else if ((i + num_simd_elements3) <= num_pts - ks2)
        {
            // 4 output points at once (SSE-width fallback).
             __m128 y_vals = _mm_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m128 x_vals = _mm_loadu_ps(&x[i - k]);
                __m128 kernel_vals = _mm_set1_ps(kernel[k + ks2]);
                y_vals = _mm_fmadd_ps(x_vals, kernel_vals, y_vals);
            }
            _mm_storeu_ps(&y[i], y_vals);
            i += num_simd_elements3;
        }
        else
        {
            // Residual: one output point at a time using scalar SSE ops.
             __m128 y_val = _mm_setzero_ps();
            for (indx_t k = -ks2; k <= ks2; k++)
            {
                __m128 x_val = _mm_load_ss(&x[i - k]);
                __m128 k_val = _mm_load_ss(&kernel[k + ks2]);
                y_val = _mm_fmadd_ss(x_val, k_val, y_val);
            }
            _mm_store_ss(&y[i], y_val);
            i += 1;
        }
    }
}
void Convolve1DKs5_F32_Iavx512(std::vector<float>& y, const std::vector<float>& x, const std::vector<float>& kernel)
{
    if (!CheckArgs(y, x, kernel))
        throw std::runtime_error("Convolve1DKs5_F32_Iavx2() - CheckArgs failed");
    if (kernel.size() != 5)
        throw std::runtime_error("Convolve1DKs5_F32_Iavx2() - invalid kernel size");
    indx_t ks2 = (indx_t)kernel.size() / 2;
    indx_t num_pts = (indx_t)y.size();
    const indx_t num_simd_elements = 16; // number of F32 elements (__m512)
    const indx_t num_simd_elements2 = 8; // number of F32 elements (__m256)
    const indx_t num_simd_elements3 = 4; // number of F32 elements (__m128)
    __m512 kernel512_0 = _mm512_set1_ps(kernel[0]);
    __m512 kernel512_1 = _mm512_set1_ps(kernel[1]);
    __m512 kernel512_2 = _mm512_set1_ps(kernel[2]);
    __m512 kernel512_3 = _mm512_set1_ps(kernel[3]);
    __m512 kernel512_4 = _mm512_set1_ps(kernel[4]);
    __m256 kernel256_0 = _mm256_set1_ps(kernel[0]);
    __m256 kernel256_1 = _mm256_set1_ps(kernel[1]);
    __m256 kernel256_2 = _mm256_set1_ps(kernel[2]);
    __m256 kernel256_3 = _mm256_set1_ps(kernel[3]);
    __m256 kernel256_4 = _mm256_set1_ps(kernel[4]);
    __m128 kernel128_0 = _mm_set1_ps(kernel[0]);
    __m128 kernel128_1 = _mm_set1_ps(kernel[1]);
    __m128 kernel128_2 = _mm_set1_ps(kernel[2]);
    __m128 kernel128_3 = _mm_set1_ps(kernel[3]);
    __m128 kernel128_4 = _mm_set1_ps(kernel[4]);
    indx_t i = ks2;
    while (i < num_pts - ks2)
    {
        indx_t j = i + ks2;
        if ((i + num_simd_elements) <= num_pts - ks2)
        {
            __m512 x_vals = _mm512_loadu_ps(&x[j]);
            __m512 y_vals = _mm512_mul_ps(x_vals, kernel512_0);
            x_vals = _mm512_loadu_ps(&x[j - 1]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_1, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 2]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_2, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 3]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_3, y_vals);
            x_vals = _mm512_loadu_ps(&x[j - 4]);
            y_vals = _mm512_fmadd_ps(x_vals, kernel512_4, y_vals);
            _mm512_storeu_ps(&y[i], y_vals);
            i += num_simd_elements;
        }
        else if ((i + num_simd_elements2) <= num_pts - ks2)
        {
            __m256 x_vals = _mm256_loadu_ps(&x[j]);
            __m256 y_vals = _mm256_mul_ps(x_vals, kernel256_0);
            x_vals = _mm256_loadu_ps(&x[j - 1]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_1, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 2]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_2, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 3]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_3, y_vals);
            x_vals = _mm256_loadu_ps(&x[j - 4]);
            y_vals = _mm256_fmadd_ps(x_vals, kernel256_4, y_vals);
            _mm256_storeu_ps(&y[i], y_vals);
            i += num_simd_elements2;
        }
        else if ((i + num_simd_elements3) <= num_pts - ks2)
        {
            __m128 x_vals = _mm_loadu_ps(&x[j]);
            __m128 y_vals = _mm_mul_ps(x_vals, kernel128_0);
            x_vals = _mm_loadu_ps(&x[j - 1]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_1, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 2]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_2, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 3]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_3, y_vals);
            x_vals = _mm_loadu_ps(&x[j - 4]);
            y_vals = _mm_fmadd_ps(x_vals, kernel128_4, y_vals);
            _mm_storeu_ps(&y[i], y_vals);
            i += num_simd_elements3;
        }
        else
        {
            __m128 x_val = _mm_load_ss(&x[j]);
            __m128 y_val = _mm_mul_ss(x_val, kernel128_0);
            x_val = _mm_load_ss(&x[j - 1]);
            y_val = _mm_fmadd_ss(x_val, kernel128_1, y_val);
            x_val = _mm_load_ss(&x[j - 2]);
            y_val = _mm_fmadd_ss(x_val, kernel128_2, y_val);
            x_val = _mm_load_ss(&x[j - 3]);
            y_val = _mm_fmadd_ss(x_val, kernel128_3, y_val);
            x_val = _mm_load_ss(&x[j - 4]);
            y_val = _mm_fmadd_ss(x_val, kernel128_4, y_val);
            _mm_store_ss(&y[i], y_val);
            i += 1;
        }
    }
}
Listing 8-8

Example Ch08_08

The first function in Listing 8-8, Convolve1D_F32_Cpp(), implements a 1D discrete convolution using standard C++ statements. This function is identical to the one you saw in source code example Ch06_01 and is included again here for benchmarking purposes. The next function in Listing 8-8, named Convolve1D_F32_Iavx512(), uses AVX-512 C++ SIMD intrinsic functions to implement a 1D discrete convolution. This function is similar to the function Convolve1D_F32_Iavx2() that was presented in source example Ch06_01. The primary difference is that Convolve1D_F32_Iavx512() includes an extra code block near the top of the while-loop that processes signal elements y[i:i+15] using __m512 data types and the following C++ SIMD intrinsic functions: _mm512_loadu_ps(), _mm512_set1_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps(). The other code blocks in the while-loop process signal elements y[i:i+7], y[i:i+3], or y[i] using C++ SIMD intrinsic functions and data types just like function Convolve1D_F32_Iavx2() did in example Ch06_01.

Following Convolve1D_F32_Iavx512() in Listing 8-8 is the function Convolve1DKs5_F32_Iavx512(). This function implements a 1D discrete convolution using AVX-512 and is optimized for a five-element convolution kernel. Recall from the discussions in Chapter 6 that many real-world signal processing applications frequently employ size-optimized convolution functions since they are often faster than their variable-width counterparts. Note that the principal modification between the code in Convolve1DKs5_F32_Iavx512() and the Ch06_01 function Convolve1DKs5_F32_Iavx2() is that the former includes a code block near the top of the while-loop that processes signal elements y[i:i+15] using __m512 data types and AVX-512 C++ SIMD intrinsic functions.

Source code example Ch08_08 exemplifies that it is often a straightforward programming exercise to port an AVX2 calculating function to one that can exploit the additional computational resources of AVX-512. Here are the results for source code example Ch08_08:
Executing Convolve1D_F32()
Results saved to file Ch08_08_Convolve1D_F32_Results_LITHIUM.csv
Running benchmark function Convolve1D_F32_bm - please wait
Benchmark times saved to file Ch08_08_Convolve1D_F32_bm_LITHIUM.csv
Table 8-4 shows some benchmark timing measurements for source code example Ch08_08. These measurements were made using a 1,000,000-element input signal array and a five-element convolution kernel.
Table 8-4

1D Discrete Convolution (Single-Precision) Execution Times (Microseconds)

CPU

Convolve1D_F32_Cpp()

Convolve1D_F32_Iavx512()

Convolve1DKs5_F32_Iavx512()

Intel Core i5-11600K

2268

242

200

2D Convolutions

The final source code example of this chapter, Ch08_09, demonstrates how to employ AVX-512 C++ SIMD intrinsic functions to calculate a 2D discrete convolution. This example is a modified version of source code example Ch06_04, which carried out a 2D discrete convolution using two 1D discrete convolutions. Listing 8-9 shows the calculating code for example Ch08_09. Before examining the code in Listing 8-9, you may want to review the definition of structure CD_1Dx2 that is shown in Listing 6-4 since the same structure is used in example Ch08_09.
//------------------------------------------------
//               Ch08_09_fcpp.cpp
//------------------------------------------------
#include <stdexcept>
#include <immintrin.h>
#include "Ch08_09.h"
#include "MiscTypes.h"
void Convolve1Dx2_F32_Cpp(CD_1Dx2& cd)
{
    if (!CheckArgs1Dx2(cd))
        throw std::runtime_error("Convolve1Dx2_F32_Cpp() - CheckArgs failed");
    indx_t ks = (indx_t)cd.m_KernelSize;
    indx_t ks2 = ks / 2;
    indx_t im_h = cd.m_ImH;
    indx_t im_w = cd.m_ImW;
    const std::vector<float>& im_src = cd.m_ImSrc;
    std::vector<float>& im_des = cd.m_ImDes;
    std::vector<float>& im_tmp = cd.m_ImTmp;
    const std::vector<float>& im_ker_x = cd.m_Kernel1Dx;
    const std::vector<float>& im_ker_y = cd.m_Kernel1Dy;
    // Perform 1D convolution (X)
    for (indx_t i = ks2; i < im_h - ks2; i++)
    {
        for (indx_t j = ks2; j < im_w - ks2; j++)
        {
            float im_tmp_val = 0;
            for (indx_t k = -ks2; k <= ks2; k++)
                im_tmp_val += im_src[i * im_w + j - k] * im_ker_x[k + ks2];
            im_tmp[i * im_w + j] = im_tmp_val;
        }
    }
    // Perform 1D convolution (Y)
    for (indx_t j = ks2; j < im_w - ks2; j++)
    {
        for (indx_t i = ks2; i < im_h - ks2; i++)
        {
            float im_des_val = 0;
            for (indx_t k = -ks2; k <= ks2; k++)
                im_des_val += im_tmp[(i - k) * im_w + j] * im_ker_y[k + ks2];
            im_des[i * im_w + j] = im_des_val;
        }
    }
}
//-----------------------------------------------------------------------------
// Convolve1Dx2_F32_Iavx512 - AVX-512 implementation of a 2D discrete
// convolution carried out as two separable 1D passes: a horizontal (X) pass
// into the temporary image, then a vertical (Y) pass into the destination.
//
// cd - convolution descriptor holding the source, destination, and temporary
//      images, the image dimensions, and the two 1D kernels
//
// Throws std::runtime_error when CheckArgs1Dx2() rejects the descriptor.
//
// Both passes vectorize across the column index j, processing 16 (__m512),
// 8 (__m256), or 4 (__m128) pixels per iteration, with a scalar path for the
// residual columns. In the X pass the SIMD-width choice is made per row
// segment; in the Y pass it is made once per column band and the inner loop
// walks down the rows.
//-----------------------------------------------------------------------------
void Convolve1Dx2_F32_Iavx512(CD_1Dx2& cd)
{
    if (!CheckArgs1Dx2(cd))
        throw std::runtime_error("Convolve1Dx2_F32_Iavx512() - CheckArgs failed");
    indx_t ks = (indx_t)cd.m_KernelSize;
    indx_t ks2 = ks / 2;
    indx_t im_h = cd.m_ImH;
    indx_t im_w = cd.m_ImW;
    const std::vector<float>& im_src = cd.m_ImSrc;
    std::vector<float>& im_des = cd.m_ImDes;
    std::vector<float>& im_tmp = cd.m_ImTmp;
    const std::vector<float>& im_ker_x = cd.m_Kernel1Dx;
    const std::vector<float>& im_ker_y = cd.m_Kernel1Dy;

    // Pixels processed per iteration by each register width.
    const indx_t num_simd_elements = 16;  // __m512  (16 x F32)
    const indx_t num_simd_elements2 = 8;  // __m256  ( 8 x F32)
    const indx_t num_simd_elements3 = 4;  // __m128  ( 4 x F32)

    // Perform 1D convolution (X): im_src -> im_tmp, row by row.
    for (indx_t i = ks2; i < im_h - ks2; i++)
    {
        indx_t j = ks2;
        while (j < im_w - ks2)
        {
            if (j + num_simd_elements <= im_w - ks2)
            {
                // 16 adjacent pixels per iteration.
                __m512 im_tmp_vals = _mm512_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m512 im_src_vals = _mm512_loadu_ps(&im_src[i * im_w + j - k]);
                    __m512 im_ker_vals = _mm512_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm512_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm512_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements;
            }
            else if (j + num_simd_elements2 <= im_w - ks2)
            {
                // 8 adjacent pixels per iteration (AVX2-width fallback).
                __m256 im_tmp_vals = _mm256_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m256 im_src_vals = _mm256_loadu_ps(&im_src[i * im_w + j - k]);
                    __m256 im_ker_vals = _mm256_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm256_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm256_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements2;
            }
            else if (j + num_simd_elements3 <= im_w - ks2)
            {
                // 4 adjacent pixels per iteration (SSE-width fallback).
                __m128 im_tmp_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_src_vals = _mm_loadu_ps(&im_src[i * im_w + j - k]);
                    __m128 im_ker_vals = _mm_set1_ps(im_ker_x[k + ks2]);
                    im_tmp_vals = _mm_fmadd_ps(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm_storeu_ps(&im_tmp[i * im_w + j], im_tmp_vals);
                j += num_simd_elements3;
            }
            else
            {
                // Residual: one pixel at a time using scalar SSE ops.
                __m128 im_tmp_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_src_vals = _mm_load_ss(&im_src[i * im_w + j - k]);
                    __m128 im_ker_vals = _mm_load_ss(&im_ker_x[k + ks2]);
                    im_tmp_vals = _mm_fmadd_ss(im_src_vals, im_ker_vals,
                                    im_tmp_vals);
                }
                _mm_store_ss(&im_tmp[i * im_w + j], im_tmp_vals);
                j += 1;
            }
        }
    }

    // Perform 1D convolution (Y): im_tmp -> im_des. Note the loop nesting is
    // inverted relative to the X pass: the SIMD width is chosen per column
    // band j, then the inner loop runs down the rows i so the vertical taps
    // (i - k) can reuse the same 16/8/4-wide column load pattern.
    indx_t j = ks2;
    while (j < im_w - ks2)
    {
        if (j + num_simd_elements <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m512 im_des_vals = _mm512_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m512 im_tmp_vals = _mm512_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m512 im_ker_vals = _mm512_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm512_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm512_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements;
        }
        else if (j + num_simd_elements2 <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m256 im_des_vals = _mm256_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m256 im_tmp_vals = _mm256_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m256 im_ker_vals = _mm256_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm256_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm256_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements2;
        }
        else if (j + num_simd_elements3 <= im_w - ks2)
        {
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m128 im_des_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_tmp_vals = _mm_loadu_ps(&im_tmp[(i - k) * im_w + j]);
                    __m128 im_ker_vals = _mm_set1_ps(im_ker_y[k + ks2]);
                    im_des_vals = _mm_fmadd_ps(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm_storeu_ps(&im_des[i * im_w + j], im_des_vals);
             }
            j += num_simd_elements3;
        }
        else
        {
            // Residual columns: one pixel per row using scalar SSE ops.
            for (indx_t i = ks2; i < im_h - ks2; i++)
            {
                __m128 im_des_vals = _mm_setzero_ps();
                for (indx_t k = -ks2; k <= ks2; k++)
                {
                    __m128 im_tmp_vals = _mm_load_ss(&im_tmp[(i - k) * im_w + j]);
                    __m128 im_ker_vals = _mm_load_ss(&im_ker_y[k + ks2]);
                    im_des_vals = _mm_fmadd_ss(im_tmp_vals, im_ker_vals,
                                    im_des_vals);
                }
                _mm_store_ss(&im_des[i * im_w + j], im_des_vals);
            }
            j += 1;
        }
    }
}
Listing 8-9

Example Ch08_09

The source code for file Ch08_09_fcpp.cpp that is shown in Listing 8-9 is somewhat lengthy but (hopefully) relatively straightforward to comprehend. It begins with the function Convolve1Dx2_F32_Cpp(), which implements a 2D discrete convolution using standard C++ statements. This function is identical to the one you studied in source code example Ch06_04 and is included again in this example for benchmarking purposes.

Also shown in Listing 8-9 is the SIMD calculating function Convolve1Dx2_F32_Iavx512(). This function, which is a modified version of function Convolve1Dx2_F32_Iavx2() (see Listing 6-4), performs a 2D discrete convolution using AVX-512 C++ SIMD intrinsic functions. In function Convolve1Dx2_F32_Iavx512(), note the inclusion of an extra if block in the x-axis section that processes image pixels using __m512 data types and the following C++ SIMD intrinsic functions: _mm512_loadu_ps(), _mm512_set1_ps(), _mm512_fmadd_ps(), and _mm512_storeu_ps(). A similar if block was also added to the y-axis section of Convolve1Dx2_F32_Iavx512().

Source code example Ch08_09 exemplifies again that with a little forethought, it is often relatively easy to transform a calculating function that employs AVX2 to one that exploits AVX-512. When developing SIMD calculating functions that perform calculations using AVX or AVX2, you should always keep in mind that you may want to create an AVX-512 variant sometime in the future. Here are the results for source code example Ch08_09:
Performing convolutions
Saving destination image files
rc:       true
num_diff: 0
Running benchmark function Convolve1Dx2_F32_bm - please wait
..................................................
Benchmark times saved to file Ch08_09_Convolve1Dx2_F32_bm_LITHIUM.csv
Table 8-5 shows some benchmark timing measurements for source code example Ch08_09. These measurements were made using test image ImageE.png and a 9 × 9 convolution kernel that performs low-pass filtering.
Table 8-5

2D Discrete Convolution (Single-Precision) Execution Times (Microseconds)

CPU

Convolve1Dx2_F32_Cpp()

Convolve1Dx2_F32_Iavx512()

Intel Core i5-11600K

14373

2065

Summary

Table 8-6 summarizes the C++ SIMD intrinsic functions that were introduced in this chapter. This table also includes commonly used size variants. AVX-512 C++ SIMD intrinsic functions that perform merge (_mask_) masking and zero (_maskz_) masking are also available for most of the functions listed in Table 8-6, but these are not shown. Before proceeding to the next chapter, you should understand the SIMD arithmetic calculation or data manipulation operation that is performed by each function shown in Table 8-6.
Table 8-6

C++ SIMD Intrinsic Function Summary for Chapter 8

C++ SIMD Function Name

Description

_mm256_insertf64x2

Insert double-precision elements

_mm512_abs_pd, _ps

Packed floating-point absolute value

_mm512_add_pd, _ps

Packed floating-point addition

_mm512_cmp_pd_mask, _ps_mask

Packed floating-point compare

_mm512_div_pd, _ps

Packed floating-point division

_mm512_extractf32x4_ps, f32x8_ps

Extract floating-point elements

_mm512_fmadd_pd, _ps

Packed floating-point fused-multiply-add

_mm512_insertf64x2, f64x4

Insert double-precision elements

_mm512_load_epi8, _epi16, _epi32, _epi64

Load packed integer elements

_mm512_load_pd, _ps

Load (aligned) floating-point elements

_mm512_loadu_pd, _ps

Load (unaligned) floating-point elements

_mm512_max_pd, _ps

Packed floating-point maximum

_mm512_min_pd, _ps

Packed floating-point minimum

_mm512_permutexvar_pd, _ps

Permute floating-point elements

_mm512_reduce_add_pd, _ps

Reduce (sum) floating-point elements

_mm512_set1_pd, _ps

Broadcast floating-point value to all elements

_mm512_setzero_pd, _ps

Set floating-point elements to zero

_mm512_sqrt_pd, _ps

Packed floating-point square root

_mm512_store_pd, _ps

Store (aligned) floating-point elements

_mm512_storeu_pd, _ps

Store (unaligned) floating-point elements

_mm512_stream_pd, _ps

Store (nontemporal) floating-point elements

_mm512_sub_pd, _ps

Packed floating-point subtraction

_mm_stream_pd, _ps

Store (nontemporal) floating-point elements

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
18.223.196.59