Search in book...
Toggle Font Controls
Create new playlist

Name your new playlist

Playlist description (optional)
Sign In

Email address

Password

Forgot Password?

or

Continue with Facebook

Continue with Google
Sign Up

Full Name

Email address

Confirm Email Address

Password

or

Continue with Facebook

Continue with Google

Daniel Kusswurm, Modern X86 Assembly Language Programming, https://doi.org/10.1007/978-1-4842-4063-2_13

13. AVX-512 Programming – Floating-Point

Daniel Kusswurm¹

(1)

Geneva, IL, USA

In previous chapters, you learned how to carry out scalar and packed floating-point operations using the AVX and AVX2 instruction sets. In this chapter, you learn how to perform these operations using the AVX-512 instruction set. The first part of this chapter contains source code examples that illustrate basic AVX-512 programming concepts using scalar floating-point operands. This includes examples that illustrate conditional executions, merge and zero masking, and instruction-level rounding. The second part of this chapter demonstrates how to use the AVX-512 instruction set to carry out packed floating-point calculations using 512-bit wide operands and the ZMM register set.

The source code examples of this chapter require a processor and operating system that support AVX-512 and the following instruction set extensions: AVX512F, AVX512CD, AVX512BW, AVX512DQ, and AVX512VL. As discussed in Chapter 12, these extensions are supported by processors that are based on the Intel Skylake Server microarchitecture. Future processors from both AMD and Intel are also likely to incorporate the previously mentioned instruction set extensions. You can use one of the freely available utilities listed in Appendix A to determine which AVX-512 instruction sets your system supports. In Chapter 16, you learn how to use the cpuid instruction to detect specific AVX-512 instruction set extensions at runtime.

Scalar Floating-Point

AVX-512 extends the scalar floating-point capabilities of AVX to include merge masking, zero masking, and instruction-level rounding control. The source code examples of this section explain how to use these capabilities. They also exemplify some minor differences that you need to be aware of when writing scalar floating-point code using AVX-512 instructions.

Merge Masking

Listing 13-1 shows the source code for example Ch13_01. This example describes how to perform merge masking using AVX-512 scalar floating-point instructions. It also illustrates the use of several opmask register instructions.

//------------------------------------------------

// Ch13_01.cpp

//------------------------------------------------

#include "stdafx.h"

#include <string>

#include <iostream>

#include <iomanip>

#include <limits>

#define _USE_MATH_DEFINES

#include <math.h>

using namespace std;

extern "C" double g_PI = M_PI;

extern "C" bool Avx512CalcSphereAreaVol_(double* sa, double* vol, double radius, double error_val);

/// Reference C++ implementation of the sphere surface-area / volume calculation.
///
/// @param sa        Receives the surface area (4 * PI * r * r), or error_val.
/// @param vol       Receives the volume (sa * r / 3), or error_val.
/// @param radius    Sphere radius; considered valid only when radius >= 0.0.
/// @param error_val Value stored in *sa and *vol when the radius is invalid.
/// @return true when the radius is valid, false otherwise.
bool Avx512CalcSphereAreaVolCpp(double* sa, double* vol, double radius, double error_val)
{
    // Test "not (radius >= 0.0)" instead of "radius < 0.0" so that a NaN radius
    // is rejected as well. This mirrors the >= compare that the assembly
    // version, Avx512CalcSphereAreaVol_, uses to build its valid-radius mask.
    if (!(radius >= 0.0))
    {
        *sa = error_val;
        *vol = error_val;
        return false;
    }

    *sa = 4.0 * g_PI * radius * radius;
    *vol = *sa * radius / 3.0;
    return true;
}

// Test driver for example Ch13_01: runs the C++ and assembly sphere
// calculations over a fixed set of radii and prints one table row per radius.
int main()
{
    const double error_val = numeric_limits<double>::quiet_NaN();
    const double radii[] = {-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, -7.0, 10.0, -18.0, 20.0};

    const string sp {" "};
    const string sep(75, '-');

    // Column headings; each output row must end with a newline
    cout << setw(10) << "radius" << sp;
    cout << setw(6) << "rc1" << sp;
    cout << setw(6) << "rc2" << sp;
    cout << setw(10) << "sa1" << sp;
    cout << setw(10) << "sa2" << sp;
    cout << setw(10) << "vol1" << sp;
    cout << setw(10) << "vol2" << '\n';
    cout << sep << '\n';

    // Stream state is sticky, so set the numeric/bool formatting once
    cout << fixed << setprecision(4) << boolalpha;

    for (const double r : radii)
    {
        double sa1, sa2;
        double vol1, vol2;

        const bool rc1 = Avx512CalcSphereAreaVolCpp(&sa1, &vol1, r, error_val);
        const bool rc2 = Avx512CalcSphereAreaVol_(&sa2, &vol2, r, error_val);

        cout << setw(10) << r << sp;
        cout << setw(6) << rc1 << sp;
        cout << setw(6) << rc2 << sp;
        cout << setw(10) << sa1 << sp;
        cout << setw(10) << sa2 << sp;
        cout << setw(10) << vol1 << sp;
        cout << setw(10) << vol2 << '\n';
    }

    return 0;
}

;-------------------------------------------------

; Ch13_01.asm

;-------------------------------------------------

include <cmpequ.asmh>

.const

r8_three real8 3.0

r8_four real8 4.0

extern g_PI:real8

; extern "C" bool Avx512CalcSphereAreaVol_(double* sa, double* v, double r, double error_val);

;

; Returns: false = invalid radius, true = valid radius

.code

Avx512CalcSphereAreaVol_ proc

; Register usage, as established by the instructions below:
;   rcx = address that receives the surface area
;   rdx = address that receives the volume
;   xmm2 = radius, xmm3 = error_val
;   k1[0] = 1 selects the valid-radius calculations
;   k2[0] = 1 selects the error_val substitution

; Test radius for value >= 0.0

vmovsd xmm0,xmm0,xmm2 ;xmm0 = radius

vxorpd xmm5,xmm5,xmm5 ;xmm5 = 0.0

vmovsd xmm16,xmm16,xmm3 ;xmm16 = error_val

vcmpsd k1,xmm0,xmm5,CMP_GE ;k1[0] = 1 if radius >= 0.0

; Calculate surface area and volume using mask from compare
; (merge masking: a destination is written only when k1[0] = 1)

vmulsd xmm1{k1},xmm0,xmm0 ;xmm1 = r * r

vmulsd xmm2{k1},xmm1,[r8_four] ;xmm2 = 4 * r * r

vmulsd xmm3{k1},xmm2,[g_PI] ;xmm3 = 4 * PI * r * r (sa)

vmulsd xmm4{k1},xmm3,xmm0 ;xmm4 = 4 * PI * r * r * r

vdivsd xmm5{k1},xmm4,[r8_three] ;xmm5 = 4 * PI * r * r * r / 3 (vol)

; Set surface area and volume to error_val if radius < 0.0 is true

knotw k2,k1 ;k2[0] = 1 if radius < 0.0

vmovsd xmm3{k2},xmm3,xmm16 ;xmm3 = error_val if radius < 0.0

vmovsd xmm5{k2},xmm5,xmm16 ;xmm5 = error_val if radius < 0.0

; Save results

vmovsd real8 ptr [rcx],xmm3 ;save surface area

vmovsd real8 ptr [rdx],xmm5 ;save volume

kmovw eax,k1 ;eax = return code (k1[0]: 1 = valid radius, 0 = invalid)

ret

Avx512CalcSphereAreaVol_ endp

end

Listing 13-1.

Example Ch13_01

The C++ code in Listing 13-1 starts with the function Avx512CalcSphereAreaVolCpp. This function calculates the surface area and volume of any sphere whose radius is greater than or equal to zero. If the sphere’s radius is less than zero, Avx512CalcSphereAreaVolCpp sets the surface area and volume to error_val. The remaining C++ code in Listing 13-1 performs test case initialization, exercises the functions Avx512CalcSphereAreaVolCpp and Avx512CalcSphereAreaVol_, and streams results to cout.

The assembly language function Avx512CalcSphereAreaVol_ implements the same algorithm as its C++ counterpart. This function begins with a vmovsd xmm0,xmm0,xmm2 instruction that copies argument value r to register XMM0. It then loads register XMM5 with 0.0. The vmovsd xmm16,xmm16,xmm3 instruction copies error_val into register XMM16. According to the Visual C++ calling convention, the new AVX-512 registers ZMM16–ZMM31 along with the low-order YMM and XMM counterparts are volatile across function boundaries. This means that these registers can be used by any assembly language function without preserving their values. The next instruction, vcmpsd k1,xmm0,xmm5,CMP_GE, sets opmask register bit K1[0] to one if r is greater than or equal to zero; otherwise, this bit is set to zero.

The first instruction of the surface area and volume calculation code block, vmulsd xmm1{k1},xmm0,xmm0, computes r * r if bit K1[0] is set to one (r >= 0.0 is true); it then saves the calculated product in XMM1[63:0]. If bit K1[0] is set to zero (r < 0.0 is true), the processor skips the double-precision floating-point multiplication calculation and leaves register XMM1 unaltered. The next instruction, vmulsd xmm2{k1},xmm1,[r8_four], computes 4.0 * r * r using the same merge masking operation as the previous instruction. The ensuing vmulsd and vdivsd instructions complete the required surface area (XMM3) and volume (XMM5) calculations. The merge masking operations in this code block exemplify one of AVX-512’s key computational capabilities: the processor carries out the double-precision floating-point arithmetic calculations only if bit K1[0] is set to one; otherwise no calculations are performed, and the respective destination operand registers remain unchanged.

Following the surface area and volume calculations, the knotw k2,k1 instruction negates the low-order 16 bits of K1 and saves this result to K2[15:0]. This instruction also sets bits K2[63:16] to zero. Bit K2[0] is now set to one if r < 0.0 is true. The knotw instruction is used here since it’s part of the AVX512F instruction set extension; knot[b|d|q] would also work here. The next instruction, vmovsd xmm3{k2},xmm3,xmm16, sets the surface area to error_val if r < 0.0 is true. The subsequent vmovsd xmm5{k2},xmm5,xmm16 instruction performs the same operation for the volume value. The final kmovw eax,k1 instruction loads EAX with the function return code. Here are the results for source code example Ch13_01:

radius rc1 rc2 sa1 sa2 vol1 vol2

---------------------------------------------------------------------------

-1.0000 false false nan nan nan nan

0.0000 true true 0.0000 0.0000 0.0000 0.0000

1.0000 true true 12.5664 12.5664 4.1888 4.1888

2.0000 true true 50.2655 50.2655 33.5103 33.5103

3.0000 true true 113.0973 113.0973 113.0973 113.0973

4.0000 true true 201.0619 201.0619 268.0826 268.0826

-7.0000 false false nan nan nan nan

10.0000 true true 1256.6371 1256.6371 4188.7902 4188.7902

-18.0000 false false nan nan nan nan

20.0000 true true 5026.5482 5026.5482 33510.3216 33510.3216

Zero Masking

The next source code example is named Ch13_02. This example demonstrates how to use zero masking to eliminate data-dependent conditional jumps from a calculation. Listing 13-2 shows the source code for this example.

//------------------------------------------------

// Ch13_02.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <array>

#include <random>

using namespace std;

extern "C" bool Avx512CalcValues_(double* c, const double* a, const double* b, size_t n);

// Fills x[0..n-1] with pseudo-random integers, converted to T, drawn from
// [1, 200] and shifted down by 25. The sequence is reproducible for a given seed.
template<typename T> void Init(T* x, size_t n, unsigned int seed)
{
    default_random_engine engine {seed};
    uniform_int_distribution<> dist {1, 200};

    T* const last = x + n;

    for (T* p = x; p != last; ++p)
    {
        const int shifted = dist(engine) - 25;
        *p = static_cast<T>(shifted);
    }
}

// For each i in [0, n): val = a[i] * b[i];
// c[i] = sqrt(val) when val >= 0.0, otherwise val * val.
// Returns false (and writes nothing) when n is zero, true otherwise.
bool Avx512CalcValuesCpp(double* c, const double* a, const double* b, size_t n)
{
    if (n == 0)
        return false;

    size_t idx = 0;

    while (idx < n)
    {
        const double prod = a[idx] * b[idx];

        if (prod >= 0.0)
            c[idx] = sqrt(prod);
        else
            c[idx] = prod * prod;

        ++idx;
    }

    return true;
}

// Test driver for example Ch13_02: fills two input arrays with seeded
// pseudo-random values, runs the C++ and assembly calculations, and prints
// one line per element so the two result arrays can be compared.
int main()
{
    const size_t n = 20;
    array<double, n> a;
    array<double, n> b;
    array<double, n> c1;
    array<double, n> c2;

    Init<double>(a.data(), n, 13);
    Init<double>(b.data(), n, 23);

    const bool rc1 = Avx512CalcValuesCpp(c1.data(), a.data(), b.data(), n);
    const bool rc2 = Avx512CalcValues_(c2.data(), a.data(), b.data(), n);

    if (!rc1 || !rc2)
    {
        cout << "Invalid return code - ";
        cout << "rc1 = " << boolalpha << rc1 << " ";
        cout << "rc2 = " << boolalpha << rc2 << '\n';
    }
    else
    {
        cout << fixed << setprecision(4);

        for (size_t i = 0; i < n; i++)
        {
            cout << "i: " << setw(2) << i << " ";
            cout << "a: " << setw(9) << a[i] << " ";
            cout << "b: " << setw(9) << b[i] << " ";
            cout << "c1: " << setw(13) << c1[i] << " ";
            cout << "c2: " << setw(13) << c2[i] << '\n';   // one element per line
        }
    }

    return 0;
}

;-------------------------------------------------

; Ch13_02.asm

;-------------------------------------------------

include <cmpequ.asmh>

; extern "C" bool Avx512CalcValues_(double* c, const double* a, const double* b, size_t n);

.code

Avx512CalcValues_ proc

; Computes c[i] = (val >= 0.0) ? sqrt(val) : val * val, with val = a[i] * b[i],
; for i = 0 .. n-1, using zero masking instead of data-dependent jumps.
;
; On entry: rcx = c, rdx = a, r8 = b, r9 = n
; Returns:  eax = 0 if n == 0, otherwise eax = 1

; Validate n and initialize array index i
        xor eax,eax                         ;set error return code (also i = 0)
        test r9,r9                          ;is n == 0?
        jz Done                             ;jump if n is zero

        vxorpd xmm5,xmm5,xmm5               ;xmm5 = 0.0

; Load next a[i] and b[i], calculate val
@@:     vmovsd xmm0,real8 ptr [rdx+rax*8]   ;xmm0 = a[i];
        vmovsd xmm1,real8 ptr [r8+rax*8]    ;xmm1 = b[i];
        vmulsd xmm2,xmm0,xmm1               ;val = a[i] * b[i]

; Calculate c[i] = (val >= 0.0) ? sqrt(val) : val * val
; Exactly one of xmm3/xmm4 holds a result and the other is zeroed,
; so OR-ing them selects the live value.
        vcmpsd k1,xmm2,xmm5,CMP_GE          ;k1[0] = 1 if val >= 0.0
        vsqrtsd xmm3{k1}{z},xmm3,xmm2       ;xmm3 = (val >= 0.0) ? sqrt(val) : 0.0
        knotw k2,k1                         ;k2[0] = 1 if val < 0.0
        vmulsd xmm4{k2}{z},xmm2,xmm2        ;xmm4 = (val < 0.0) ? val * val : 0.0
        vorpd xmm0,xmm4,xmm3                ;xmm0 = (val >= 0.0) ? sqrt(val) : val * val
        vmovsd real8 ptr [rcx+rax*8],xmm0   ;save result to c[i]

; Update index i and repeat until done
        inc rax                             ;i += 1
        cmp rax,r9
        jb @B                               ;unsigned compare since n is a size_t

        mov eax,1                           ;set success return code

Done:   ret
Avx512CalcValues_ endp

end

Listing 13-2.

Example Ch13_02

In the C++ code, the function Avx512CalcValuesCpp performs a simple arithmetic calculation using double-precision floating-point arrays. Each loop iteration begins by calculating the intermediate value val = a[i] * b[i]. The next statement, c[i] = (val >= 0.0) ? sqrt(val) : val * val, loads c[i] with a quantity that varies depending on the value of val. The assembly language function Avx512CalcValues_ also performs the same computation. The C++ function main contains code that initializes the test arrays, exercises the functions Avx512CalcValuesCpp and Avx512CalcValues_, and displays the results.

The processing loop of Avx512CalcValues_ begins with two vmovsd instructions that load a[i] and b[i] into registers XMM0 and XMM1, respectively. The ensuing vmulsd xmm2,xmm0,xmm1 instruction computes the intermediate product val = a[i] * b[i]. Following the calculation of val, the vcmpsd k1,xmm2,xmm5,CMP_GE instruction compares val against 0.0 and sets bit K1[0] to one if val is greater than or equal to zero; otherwise bit K1[0] is set to zero. The next instruction, vsqrtsd xmm3{k1}{z},xmm3,xmm2, calculates the square root of val if K1[0] is set to one and saves the result in XMM3. If K1[0] is zero, the processor skips the square root calculation and sets register XMM3 to 0.0.

The knotw k2,k1 instruction sets K2[0] to one if val is less than 0.0. The ensuing vmulsd xmm4{k2}{z},xmm2,xmm2 instruction calculates and saves the product val * val in XMM4 if bit K2[0] is set to one; otherwise XMM4 is set equal to 0.0. Following execution of the vmulsd instruction, register XMM3 contains sqrt(val) and XMM4 contains 0.0, or XMM3 contains 0.0 and XMM4 holds val * val. These register values facilitate use of a vorpd xmm0,xmm4,xmm3 instruction to load XMM0 with the final value for c[i]. Like the previous source code example, the function Avx512CalcValues_ demonstrates an important capability of AVX-512. The use of zero masking and some simple Boolean logic allowed Avx512CalcValues_ to make logical decisions sans any conditional jump instructions. This is noteworthy since data-dependent conditional jump instructions are often slower than straight-line code. Here is the output for source code example Ch13_02:

i: 0 a: -6.0000 b: 67.0000 c1: 161604.0000 c2: 161604.0000

i: 1 a: 128.0000 b: 22.0000 c1: 53.0660 c2: 53.0660

i: 2 a: 130.0000 b: -8.0000 c1: 1081600.0000 c2: 1081600.0000

i: 3 a: 152.0000 b: 73.0000 c1: 105.3376 c2: 105.3376

i: 4 a: 94.0000 b: 6.0000 c1: 23.7487 c2: 23.7487

i: 5 a: 2.0000 b: 88.0000 c1: 13.2665 c2: 13.2665

i: 6 a: 12.0000 b: 103.0000 c1: 35.1568 c2: 35.1568

i: 7 a: 105.0000 b: 117.0000 c1: 110.8377 c2: 110.8377

i: 8 a: 140.0000 b: -20.0000 c1: 7840000.0000 c2: 7840000.0000

i: 9 a: 74.0000 b: 3.0000 c1: 14.8997 c2: 14.8997

i: 10 a: 43.0000 b: -9.0000 c1: 149769.0000 c2: 149769.0000

i: 11 a: 2.0000 b: 122.0000 c1: 15.6205 c2: 15.6205

i: 12 a: 36.0000 b: 9.0000 c1: 18.0000 c2: 18.0000

i: 13 a: -18.0000 b: 123.0000 c1: 4901796.0000 c2: 4901796.0000

i: 14 a: 170.0000 b: 134.0000 c1: 150.9304 c2: 150.9304

i: 15 a: 102.0000 b: 3.0000 c1: 17.4929 c2: 17.4929

i: 16 a: 118.0000 b: -19.0000 c1: 5026564.0000 c2: 5026564.0000

i: 17 a: 85.0000 b: 148.0000 c1: 112.1606 c2: 112.1606

i: 18 a: 61.0000 b: 65.0000 c1: 62.9682 c2: 62.9682

i: 19 a: 18.0000 b: 74.0000 c1: 36.4966 c2: 36.4966

Instruction-Level Rounding

The final source code example of this section, Ch13_03, explains how to use instruction-level rounding operands. It also illustrates use of the AVX-512 instructions that perform conversions between floating-point and unsigned integer values. Listing 13-3 shows the source code for example Ch13_03.

//------------------------------------------------

// Ch13_03.cpp

//------------------------------------------------

#include "stdafx.h"

#include <cstdint>

#include <iostream>

#include <iomanip>

#define _USE_MATH_DEFINES

#include <math.h>

using namespace std;

extern "C" void Avx512CvtF32ToU32_(uint32_t val_cvt[4], float val);

extern "C" void Avx512CvtF64ToU64_(uint64_t val_cvt[4], double val);

extern "C" void Avx512CvtF64ToF32_(float val_cvt[4], double val);

// Exercises Avx512CvtF32ToU32_ with two float test values and prints the four
// converted results (one per rounding mode) for each value.
void ConvertF32ToU32(void)
{
    // Rounding-mode tags, in the order the results are stored in val_cvt[]
    static const char* const c_RoundModes[4] {"rn-sae", "rd-sae", "ru-sae", "rz-sae"};

    uint32_t val_cvt[4];
    const float val[] {static_cast<float>(M_PI), static_cast<float>(M_SQRT2)};
    const int num_vals = sizeof(val) / sizeof(float);

    cout << "\nConvertF32ToU32\n";

    for (int i = 0; i < num_vals; i++)
    {
        Avx512CvtF32ToU32_(val_cvt, val[i]);

        cout << " Test case #" << i << " val = " << val[i] << '\n';

        for (int j = 0; j < 4; j++)
            cout << " val_cvt[" << j << "] {" << c_RoundModes[j] << "} = " << val_cvt[j] << '\n';
    }
}

// Exercises Avx512CvtF64ToU64_ with two double test values and prints the four
// converted results (one per rounding mode) for each value.
void ConvertF64ToU64(void)
{
    // Rounding-mode tags, in the order the results are stored in val_cvt[]
    static const char* const c_RoundModes[4] {"rn-sae", "rd-sae", "ru-sae", "rz-sae"};

    uint64_t val_cvt[4];

    // Use the full double-precision constants; narrowing them to float first
    // would only discard precision in a double-precision test.
    const double val[] {M_PI, M_SQRT2};
    const int num_vals = sizeof(val) / sizeof(double);

    cout << "\nConvertF64ToU64\n";

    for (int i = 0; i < num_vals; i++)
    {
        Avx512CvtF64ToU64_(val_cvt, val[i]);

        cout << " Test case #" << i << " val = " << val[i] << '\n';

        for (int j = 0; j < 4; j++)
            cout << " val_cvt[" << j << "] {" << c_RoundModes[j] << "} = " << val_cvt[j] << '\n';
    }
}

// Exercises Avx512CvtF64ToF32_ with two double test values and prints the four
// single-precision results (one per rounding mode) for each value.
void ConvertF64ToF32(void)
{
    // Rounding-mode tags, in the order the results are stored in val_cvt[]
    static const char* const c_RoundModes[4] {"rn-sae", "rd-sae", "ru-sae", "rz-sae"};

    float val_cvt[4];
    const double val[] {M_PI, -M_SQRT2};
    const int num_vals = sizeof(val) / sizeof(double);

    cout << "\nConvertF64ToF32\n";

    // Seven decimals are enough to expose the one-ulp differences between
    // rounding modes; the setting is sticky, so apply it once.
    cout << fixed << setprecision(7);

    for (int i = 0; i < num_vals; i++)
    {
        Avx512CvtF64ToF32_(val_cvt, val[i]);

        cout << " Test case #" << i << " val = " << val[i] << '\n';

        for (int j = 0; j < 4; j++)
            cout << " val_cvt[" << j << "] {" << c_RoundModes[j] << "} = " << val_cvt[j] << '\n';
    }
}

// Entry point for example Ch13_03: runs the three conversion demonstrations
// in sequence.
int main()
{
    ConvertF32ToU32();      // float  -> uint32_t
    ConvertF64ToU64();      // double -> uint64_t
    ConvertF64ToF32();      // double -> float

    return 0;
}

;-------------------------------------------------

; Ch13_03.asm

;-------------------------------------------------

; extern "C" void Avx512CvtF32ToU32_(uint32_t val_cvt[4], float val);

.code

Avx512CvtF32ToU32_ proc

; Converts the single-precision value in xmm1 (val) to an unsigned doubleword
; four times, once per embedded rounding mode, and stores the results in the
; four consecutive dwords at [rcx] (val_cvt[0..3]).

vcvtss2usi eax,xmm1{rn-sae} ;Convert using round to nearest

mov dword ptr [rcx],eax ;val_cvt[0]

vcvtss2usi eax,xmm1{rd-sae} ;Convert using round down

mov dword ptr [rcx+4],eax ;val_cvt[1]

vcvtss2usi eax,xmm1{ru-sae} ;Convert using round up

mov dword ptr [rcx+8],eax ;val_cvt[2]

vcvtss2usi eax,xmm1{rz-sae} ;Convert using round to zero (truncate)

mov dword ptr [rcx+12],eax ;val_cvt[3]

ret

Avx512CvtF32ToU32_ endp

; extern "C" void Avx512CvtF64ToU64_(uint64_t val_cvt[4], double val);

Avx512CvtF64ToU64_ proc

; Converts the double-precision value in xmm1 (val) to an unsigned quadword
; four times, once per embedded rounding mode, and stores the results in the
; four consecutive qwords at [rcx] (val_cvt[0..3]).

vcvtsd2usi rax,xmm1{rn-sae} ;Convert using round to nearest

mov qword ptr [rcx],rax ;val_cvt[0]

vcvtsd2usi rax,xmm1{rd-sae} ;Convert using round down

mov qword ptr [rcx+8],rax ;val_cvt[1]

vcvtsd2usi rax,xmm1{ru-sae} ;Convert using round up

mov qword ptr [rcx+16],rax ;val_cvt[2]

vcvtsd2usi rax,xmm1{rz-sae} ;Convert using round to zero (truncate)

mov qword ptr [rcx+24],rax ;val_cvt[3]

ret

Avx512CvtF64ToU64_ endp

; extern "C" void Avx512CvtF64ToF32_(float val_cvt[4], double val);

Avx512CvtF64ToF32_ proc

; Converts the double-precision value in xmm1 (val) to single precision four
; times, once per embedded rounding mode, and stores the results in the four
; consecutive floats at [rcx] (val_cvt[0..3]). xmm2 is used as scratch.

vcvtsd2ss xmm2,xmm2,xmm1{rn-sae} ;Convert using round to nearest

vmovss real4 ptr [rcx],xmm2 ;val_cvt[0]

vcvtsd2ss xmm2,xmm2,xmm1{rd-sae} ;Convert using round down

vmovss real4 ptr [rcx+4],xmm2 ;val_cvt[1]

vcvtsd2ss xmm2,xmm2,xmm1{ru-sae} ;Convert using round up

vmovss real4 ptr [rcx+8],xmm2 ;val_cvt[2]

vcvtsd2ss xmm2,xmm2,xmm1{rz-sae} ;Convert using round to zero (truncate)

vmovss real4 ptr [rcx+12],xmm2 ;val_cvt[3]

ret

Avx512CvtF64ToF32_ endp

end

Listing 13-3.

Example Ch13_03

The C++ code in Listing 13-3 begins with the function ConvertF32ToU32. This function performs test case initialization and exercises the assembly language function Avx512CvtF32ToU32_, which converts a single-precision floating-point value to an unsigned doubleword (32-bit) integer using different rounding modes. The results are then streamed to cout. The C++ functions ConvertF64ToU64 and ConvertF64ToF32 carry out similar test case initializations for the assembly language functions Avx512CvtF64ToU64_ and Avx512CvtF64ToF32_, respectively.

The first instruction of assembly language function Avx512CvtF32ToU32_, vcvtss2usi eax,xmm1{rn-sae}, converts the scalar single-precision floating-point value in XMM1 (or val) to an unsigned doubleword integer using the rounding mode round-to-nearest. As mentioned in Chapter 12, the -sae suffix that’s appended to the embedded rounding mode string is a reminder that floating-point exceptions and MXCSR flag updates are always disabled when an instruction-level rounding control operand is specified. The ensuing mov dword ptr [rcx],eax instruction saves the converted result in val_cvt[0]. Avx512CvtF32ToU32_ then employs additional vcvtss2usi instructions to carry out the same conversion operation using rounding modes round-down, round-up, and round-to-zero. The organization of function Avx512CvtF64ToU64_ is similar to Avx512CvtF32ToU32_ and uses the vcvtsd2usi instruction to convert a double-precision floating-point value to an unsigned quadword integer. Note that both vcvtss2usi and vcvtsd2usi are new AVX-512 instructions. AVX-512 also includes the instructions vcvtusi2s[d|s], which perform unsigned integer to floating-point conversions. Neither AVX nor AVX2 includes instructions that perform these types of conversions.

The final assembly language function, Avx512CvtF64ToF32_, applies the vcvtsd2ss instruction to convert a double-precision floating-point value to single-precision floating-point. The vcvtsd2ss instruction is an existing AVX instruction that can be used with an instruction-level rounding control operand on systems that support AVX-512. Here is the output for source code example Ch13_03.

ConvertF32ToU32

Test case #0 val = 3.14159

val_cvt[0] {rn-sae} = 3

val_cvt[1] {rd-sae} = 3

val_cvt[2] {ru-sae} = 4

val_cvt[3] {rz-sae} = 3

Test case #1 val = 1.41421

val_cvt[0] {rn-sae} = 1

val_cvt[1] {rd-sae} = 1

val_cvt[2] {ru-sae} = 2

val_cvt[3] {rz-sae} = 1

ConvertF64ToU64

Test case #0 val = 3.14159

val_cvt[0] {rn-sae} = 3

val_cvt[1] {rd-sae} = 3

val_cvt[2] {ru-sae} = 4

val_cvt[3] {rz-sae} = 3

Test case #1 val = 1.41421

val_cvt[0] {rn-sae} = 1

val_cvt[1] {rd-sae} = 1

val_cvt[2] {ru-sae} = 2

val_cvt[3] {rz-sae} = 1

ConvertF64ToF32

Test case #0 val = 3.1415927

val_cvt[0] {rn-sae} = 3.1415927

val_cvt[1] {rd-sae} = 3.1415925

val_cvt[2] {ru-sae} = 3.1415927

val_cvt[3] {rz-sae} = 3.1415925

Test case #1 val = -1.4142136

val_cvt[0] {rn-sae} = -1.4142135

val_cvt[1] {rd-sae} = -1.4142137

val_cvt[2] {ru-sae} = -1.4142135

val_cvt[3] {rz-sae} = -1.4142135

Packed Floating-Point

The source code examples of this section illustrate how to use AVX-512 instructions to carry out computations using packed floating-point operands. The first three source code examples demonstrate basic operations with 512-bit wide packed floating-point operands including simple arithmetic, compare operations, and merge masking. The remaining examples focus on specific algorithms including vector cross product calculations, matrix-vector multiplications, and convolutions.

Packed Floating-Point Arithmetic

Listing 13-4 shows the source code for example Ch13_04. This example demonstrates how to perform common arithmetic operations using 512-bit wide single-precision and double-precision floating-point operands. It also highlights some of the similarities between AVX/AVX2 and AVX-512 programming.

//------------------------------------------------

// ZmmVal.h

//------------------------------------------------

#pragma once

#include <string>

#include <cstdint>

#include <sstream>

#include <iomanip>

// Overlay of one 512-bit (64-byte) value viewed as arrays of 8/16/32/64-bit
// signed or unsigned integers, or as float/double elements.
// NOTE(review): this listing appears truncated - the ToStringF32/ToStringF64
// members called from Ch13_04.cpp and the struct's closing brace are not shown.
struct ZmmVal

{

public:

// All members share the same 64 bytes of storage
union

{

int8_t m_I8[64];

int16_t m_I16[32];

int32_t m_I32[16];

int64_t m_I64[8];

uint8_t m_U8[64];

uint16_t m_U16[32];

uint32_t m_U32[16];

uint64_t m_U64[8];

float m_F32[16];

double m_F64[8];

};

//------------------------------------------------

// Ch13_04.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#define _USE_MATH_DEFINES

#include <math.h>

#include "ZmmVal.h"

using namespace std;

extern "C" void Avx512PackedMathF32_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[8]);

extern "C" void Avx512PackedMathF64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[8]);

// Exercises Avx512PackedMathF32_ with sixteen single-precision element pairs
// and prints the eight result vectors in four groups.
void Avx512PackedMathF32(void)
{
    static const float c_ValsA[16]
    {
        36.0f, 0.03125f, 2.0f, 42.0f, 7.0f, 20.5f, 36.125f, 0.5f,
        136.0f, 2.03125f, 32.0f, 442.0f, 57.0f, 620.5f, 736.125f, 80.5f
    };

    static const float c_ValsB[16]
    {
        -0.1111111f, 64.0f, -0.0625f, 8.666667f, -18.125f, 56.0f, 24.0f, -158.6f,
        -9.1111111f, 864.0f, -70.0625f, 98.666667f, -518.125f, 456.0f, 324.0f, -298.6f
    };

    // Result labels, in the order the assembly function fills c[0..7]
    static const char* const c_Labels[8]
    {
        " addps: ", " subps: ", " mulps: ", " divps: ",
        " absps: ", " sqrtps: ", " minps: ", " maxps: "
    };

    // The assembly function uses aligned 512-bit loads/stores (vmovaps)
    alignas(64) ZmmVal a;
    alignas(64) ZmmVal b;
    alignas(64) ZmmVal c[8];

    for (int i = 0; i < 16; i++)
    {
        a.m_F32[i] = c_ValsA[i];
        b.m_F32[i] = c_ValsB[i];
    }

    Avx512PackedMathF32_(&a, &b, c);

    cout << "\nResults for Avx512PackedMathF32\n";

    for (int i = 0; i < 4; i++)
    {
        cout << "Group #" << i << '\n';
        cout << " a: " << a.ToStringF32(i) << '\n';
        cout << " b: " << b.ToStringF32(i) << '\n';

        for (int k = 0; k < 8; k++)
            cout << c_Labels[k] << c[k].ToStringF32(i) << '\n';

        cout << '\n';
    }
}

// Exercises Avx512PackedMathF64_ with eight double-precision element pairs
// and prints the eight result vectors in four groups.
void Avx512PackedMathF64(void)
{
    static const double c_ValsA[8] {2.0, 4.0, 7.5, 3.0, 12.0, 24.0, 37.5, 43.0};

    static const double c_ValsB[8]
    {
        M_PI, M_E, -9.125, -M_PI,
        M_PI / 2, M_E / 2, -9.125 / 2, -M_PI / 2
    };

    // Result labels, in the order they are read back from c[0..7]
    static const char* const c_Labels[8]
    {
        " addpd: ", " subpd: ", " mulpd: ", " divpd: ",
        " abspd: ", " sqrtpd: ", " minpd: ", " maxpd: "
    };

    alignas(64) ZmmVal a;
    alignas(64) ZmmVal b;
    alignas(64) ZmmVal c[8];

    for (int i = 0; i < 8; i++)
    {
        a.m_F64[i] = c_ValsA[i];
        b.m_F64[i] = c_ValsB[i];
    }

    Avx512PackedMathF64_(&a, &b, c);

    cout << "\nResults for Avx512PackedMathF64\n";

    for (int i = 0; i < 4; i++)
    {
        cout << "Group #" << i << '\n';
        cout << " a: " << a.ToStringF64(i) << '\n';
        cout << " b: " << b.ToStringF64(i) << '\n';

        for (int k = 0; k < 8; k++)
            cout << c_Labels[k] << c[k].ToStringF64(i) << '\n';

        cout << '\n';
    }
}

// Program entry point for example Ch13_04: runs the single-precision
// demonstration first, then the double-precision one.
int main()
{
    Avx512PackedMathF32();
    Avx512PackedMathF64();
    return 0;
}

;-------------------------------------------------

; Ch13_04.asm

;-------------------------------------------------

; Mask values used to calculate floating-point absolute values.
; Every element has all bits set except the most significant (sign) bit, so
; ANDing a floating-point value with its mask clears the sign.
; The segment is declared with align(64); AbsMaskF32 occupies exactly 64 bytes,
; so both constants start on a 64-byte boundary.
ConstVals segment readonly align(64) 'const'
AbsMaskF32 dword 16 dup(7fffffffh)
AbsMaskF64 qword 8 dup(7fffffffffffffffh)
ConstVals ends

; extern "C" void Avx512PackedMathF32_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[8]);

.code

; Applies eight packed single-precision operations to the sixteen elements of
; *a and *b and stores each 512-bit result in consecutive 64-byte slots of c:
;   c[0] = a + b    c[1] = a - b    c[2] = a * b     c[3] = a / b
;   c[4] = |b|      c[5] = sqrt(a)  c[6] = min(a,b)  c[7] = max(a,b)
; Register use, as shown by the code: rcx = a, rdx = b, r8 = c.
; Every load and store uses vmovaps (aligned move), so a, b and c are expected
; to be 64-byte aligned. zmm2 is the only scratch register.
Avx512PackedMathF32_ proc

; Load packed SP floating-point values
        vmovaps zmm0,zmmword ptr [rcx] ;zmm0 = *a
        vmovaps zmm1,zmmword ptr [rdx] ;zmm1 = *b

; Packed SP floating-point addition
        vaddps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+0],zmm2

; Packed SP floating-point subtraction
        vsubps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+64],zmm2

; Packed SP floating-point multiplication
        vmulps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+128],zmm2

; Packed SP floating-point division
        vdivps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+192],zmm2

; Packed SP floating-point absolute value (b)
; (AND with AbsMaskF32 clears the sign bit of each element)
        vandps zmm2,zmm1,zmmword ptr [AbsMaskF32]
        vmovaps zmmword ptr [r8+256],zmm2

; Packed SP floating-point square root (a)
        vsqrtps zmm2,zmm0
        vmovaps zmmword ptr [r8+320],zmm2

; Packed SP floating-point minimum
        vminps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+384],zmm2

; Packed SP floating-point maximum
        vmaxps zmm2,zmm0,zmm1
        vmovaps zmmword ptr [r8+448],zmm2

; Clear the upper register bits before returning so that a caller executing
; x86-SSE code does not incur a state-transition delay.
        vzeroupper
        ret
Avx512PackedMathF32_ endp

; extern "C" void Avx512PackedMathF64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[8]);

; Double-precision counterpart of Avx512PackedMathF32_: applies eight packed
; operations to the eight doubles of *a and *b and stores each 512-bit result
; in consecutive 64-byte slots of c:
;   c[0] = a + b    c[1] = a - b    c[2] = a * b     c[3] = a / b
;   c[4] = |b|      c[5] = sqrt(a)  c[6] = min(a,b)  c[7] = max(a,b)
; Register use, as shown by the code: rcx = a, rdx = b, r8 = c.
; Every load and store uses vmovapd (aligned move), so a, b and c are expected
; to be 64-byte aligned. zmm2 is the only scratch register.
Avx512PackedMathF64_ proc

; Load packed DP floating-point values
        vmovapd zmm0,zmmword ptr [rcx] ;zmm0 = *a
        vmovapd zmm1,zmmword ptr [rdx] ;zmm1 = *b

; Packed DP floating-point addition
        vaddpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+0],zmm2

; Packed DP floating-point subtraction
        vsubpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+64],zmm2

; Packed DP floating-point multiplication
        vmulpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+128],zmm2

; Packed DP floating-point division
        vdivpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+192],zmm2

; Packed DP floating-point absolute value (b)
; (AND with AbsMaskF64 clears the sign bit of each element)
        vandpd zmm2,zmm1,zmmword ptr [AbsMaskF64]
        vmovapd zmmword ptr [r8+256],zmm2

; Packed DP floating-point square root (a)
        vsqrtpd zmm2,zmm0
        vmovapd zmmword ptr [r8+320],zmm2

; Packed DP floating-point minimum
        vminpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+384],zmm2

; Packed DP floating-point maximum
        vmaxpd zmm2,zmm0,zmm1
        vmovapd zmmword ptr [r8+448],zmm2

; Clear the upper register bits before returning so that a caller executing
; x86-SSE code does not incur a state-transition delay.
        vzeroupper
        ret
Avx512PackedMathF64_ endp

end

Listing 13-4.

Example Ch13_04

Listing 13-4 starts with the declaration of the C++ structure ZmmVal , which is declared in the header file ZmmVal.h. This structure is analogous to the XmmVal and YmmVal structures that were used by the source code examples in Chapters 6 and 9. The structure ZmmVal contains a publicly-accessible anonymous union that simplifies packed operand data exchange between functions written in C++ and the x86 assembly language. The members of this union correspond to the packed data types that can be used with a ZMM register . The structure ZmmVal also includes several string formatting functions for display purposes (the source code for these member functions is not shown).

The remaining C++ code in Listing 13-4 is similar to the code that was used in example Ch09_01. The declarations for assembly language functions Avx512PackedMathF32_ and Avx512PackedMathF64_ follow the declaration of structure ZmmVal. These functions carry out various packed single-precision and double-precision floating-point arithmetic operations using the supplied ZmmVal arguments. The C++ functions Avx512PackedMathF32 and Avx512PackedMathF64 perform ZmmVal variable initializations, invoke the assembly language calculating functions, and display results. Note that the alignas(64) specifier is used with each ZmmVal variable definition.

The assembly language code in Listing 13-4 begins with a 64-byte aligned custom memory segment named ConstVals. This segment contains definitions for the packed constant values that are used in the calculating functions. A custom segment is used here since the MASM align directive does not support aligning data items on a 64-byte boundary. Chapter 9 contains additional information about custom memory segments. The segment ConstVals contains the constants AbsMaskF32 and AbsMaskF64, which are used to calculate absolute values for 512-bit wide packed single-precision and double-precision floating-point values.

The first instruction of Avx512PackedMathF32_, vmovaps zmm0,zmmword ptr [rcx], loads argument a (the 16 floating-point values in ZmmVal a) into register ZMM0. The vmovaps instruction can be used here since ZmmVal a was defined using the alignas(64) specifier. The operator zmmword ptr directs the assembler to treat the memory location pointed to by RCX as a 512-bit wide operand. Like the operators xmmword ptr and ymmword ptr, the zmmword ptr operator is often used to improve code readability even when it’s not explicitly required. The ensuing vmovaps zmm1,zmmword ptr [rdx] instruction loads ZmmVal b into register ZMM1. The vaddps zmm2,zmm0,zmm1 instruction that follows sums the packed single-precision floating-point values in ZMM0 and ZMM1 and saves the result in ZMM2. The vmovaps zmmword ptr [r8],zmm2 instruction saves the packed sums to c[0].

The ensuing vsubps, vmulps, and vdivps instructions carry out packed single-precision floating-point subtraction, multiplication, and division. This is followed by a vandps zmm2,zmm1,zmmword ptr [AbsMaskF32] instruction that calculates packed absolute values using argument b. The remaining instructions in Avx512PackedMathF32_ calculate packed single-precision floating-point square roots, minimums, and maximums.

Prior to its ret instruction, the function Avx512PackedMathF32_ uses a vzeroupper instruction, which zeros the high-order 384 bits of registers ZMM0–ZMM15. As explained in Chapter 4, a vzeroupper instruction is used here to avoid potential performance delays that can occur whenever the processor transitions from executing x86-AVX code to executing x86-SSE code. Any assembly language function that uses one or more YMM or ZMM registers and is callable from code that potentially uses x86-SSE instructions should ensure that a vzeroupper instruction is executed before program control is transferred back to the calling function. It should be noted that according to the Intel 64 and IA-32 Architectures Optimization Reference Manual, the vzeroupper use recommendations apply to functions that employ x86-AVX instructions with registers ZMM0–ZMM15 or YMM0–YMM15. Functions that only exploit registers ZMM16–ZMM31 or YMM16–YMM31 do not need to observe the vzeroupper use recommendations.

The organization of function Avx512PackedMathF64_ is similar to Avx512PackedMathF32_. Avx512PackedMathF64_ carries out its calculations using the double-precision versions of the same AVX-512 instructions that are used in Avx512PackedMathF32_. Here is the output for source code example Ch13_04:

Results for Avx512PackedMathF32

Group #0

a: 36.000000 0.031250 | 2.000000 42.000000

b: -0.111111 64.000000 | -0.062500 8.666667

addps: 35.888889 64.031250 | 1.937500 50.666668

subps: 36.111111 -63.968750 | 2.062500 33.333332

mulps: -4.000000 2.000000 | -0.125000 364.000000

divps: -324.000031 0.000488 | -32.000000 4.846154

absps: 0.111111 64.000000 | 0.062500 8.666667

sqrtps: 6.000000 0.176777 | 1.414214 6.480741

minps: -0.111111 0.031250 | -0.062500 8.666667

maxps: 36.000000 64.000000 | 2.000000 42.000000

Group #1

a: 7.000000 20.500000 | 36.125000 0.500000

b: -18.125000 56.000000 | 24.000000 -158.600006

addps: -11.125000 76.500000 | 60.125000 -158.100006

subps: 25.125000 -35.500000 | 12.125000 159.100006

mulps: -126.875000 1148.000000 | 867.000000 -79.300003

divps: -0.386207 0.366071 | 1.505208 -0.003153

absps: 18.125000 56.000000 | 24.000000 158.600006

sqrtps: 2.645751 4.527693 | 6.010407 0.707107

minps: -18.125000 20.500000 | 24.000000 -158.600006

maxps: 7.000000 56.000000 | 36.125000 0.500000

Group #2

a: 136.000000 2.031250 | 32.000000 442.000000

b: -9.111111 864.000000 | -70.062500 98.666664

addps: 126.888885 866.031250 | -38.062500 540.666687

subps: 145.111115 -861.968750 | 102.062500 343.333344

mulps: -1239.111084 1755.000000 | -2242.000000 43610.664063

divps: -14.926830 0.002351 | -0.456735 4.479730

absps: 9.111111 864.000000 | 70.062500 98.666664

sqrtps: 11.661903 1.425219 | 5.656854 21.023796

minps: -9.111111 2.031250 | -70.062500 98.666664

maxps: 136.000000 864.000000 | 32.000000 442.000000

Group #3

a: 57.000000 620.500000 | 736.125000 80.500000

b: -518.125000 456.000000 | 324.000000 -298.600006

addps: -461.125000 1076.500000 | 1060.125000 -218.100006

subps: 575.125000 164.500000 | 412.125000 379.100006

mulps: -29533.125000 282948.000000 | 238504.500000 -24037.300781

divps: -0.110012 1.360746 | 2.271991 -0.269591

absps: 518.125000 456.000000 | 324.000000 298.600006

sqrtps: 7.549834 24.909838 | 27.131624 8.972179

minps: -518.125000 456.000000 | 324.000000 -298.600006

maxps: 57.000000 620.500000 | 736.125000 80.500000

Results for Avx512PackedMathF64

Group #0

a: 2.000000000000 | 4.000000000000

b: 3.141592653590 | 2.718281828459

addpd: 5.141592653590 | 6.718281828459

subpd: -1.141592653590 | 1.281718171541

mulpd: 6.283185307180 | 10.873127313836

divpd: 0.636619772368 | 1.471517764686

abspd: 3.141592653590 | 2.718281828459

sqrtpd: 1.414213562373 | 2.000000000000

minpd: 2.000000000000 | 2.718281828459

maxpd: 3.141592653590 | 4.000000000000

Group #1

a: 7.500000000000 | 3.000000000000

b: -9.125000000000 | -3.141592653590

addpd: -1.625000000000 | -0.141592653590

subpd: 16.625000000000 | 6.141592653590

mulpd: -68.437500000000 | -9.424777960769

divpd: -0.821917808219 | -0.954929658551

abspd: 9.125000000000 | 3.141592653590

sqrtpd: 2.738612787526 | 1.732050807569

minpd: -9.125000000000 | -3.141592653590

maxpd: 7.500000000000 | 3.000000000000

Group #2

a: 12.000000000000 | 24.000000000000

b: 1.570796326795 | 1.359140914230

addpd: 13.570796326795 | 25.359140914230

subpd: 10.429203673205 | 22.640859085770

mulpd: 18.849555921539 | 32.619381941509

divpd: 7.639437268411 | 17.658213176229

abspd: 1.570796326795 | 1.359140914230

sqrtpd: 3.464101615138 | 4.898979485566

minpd: 1.570796326795 | 1.359140914230

maxpd: 12.000000000000 | 24.000000000000

Group #3

a: 37.500000000000 | 43.000000000000

b: -4.562500000000 | -1.570796326795

addpd: 32.937500000000 | 41.429203673205

subpd: 42.062500000000 | 44.570796326795

mulpd: -171.093750000000 | -67.544242052181

divpd: -8.219178082192 | -27.374650211806

abspd: 4.562500000000 | 1.570796326795

sqrtpd: 6.123724356958 | 6.557438524302

minpd: -4.562500000000 | -1.570796326795

maxpd: 37.500000000000 | 43.000000000000

Packed Floating-Point Compares

In Chapter 6 you learned how to use the vcmpp[s|d] instructions to perform packed single-precision and double-precision floating-point compare operations (see source code example Ch06_02). Recall that the AVX version of these instructions set the elements of a SIMD operand to all zeros or all ones to indicate the result of a compare operation. In this section, you learn how to use the AVX-512 version of the vcmpps instruction, which saves its compare result in an opmask register. Listing 13-5 shows the C++ and assembly language code for example Ch13_05.

//------------------------------------------------

// Ch13_05.cpp

//------------------------------------------------

#include "stdafx.h"

#include <cstdint>

#include <iostream>

#include <iomanip>

#include <limits>

#include "ZmmVal.h"

using namespace std;

// Assembly routine (Ch13_05.asm): compares the single-precision elements of
// *a and *b using eight compare predicates and stores one 16-bit result mask
// per predicate in c[0..7].
extern "C" void Avx512PackedCompareF32_(const ZmmVal* a, const ZmmVal* b, uint16_t c[8]);

// Display names for the eight compare predicates, listed in the same order in
// which Avx512PackedCompareF32_ writes its masks to c[].
const char* c_CmpStr[8] =
{
    "EQ", "NE", "LT", "LE", "GT", "GE", "ORDERED", "UNORDERED"
};

// Expands eight 16-bit compare masks into eight ZmmVal objects for display.
// For each mask src[i], doubleword element j of des[i] is set to 1 when bit j
// of the mask is set and to 0 otherwise.
//   des - receives the expanded masks (eight ZmmVal objects)
//   src - the eight 16-bit masks to expand
void ToZmmVal(ZmmVal des[8], uint16_t src[8])
{
    for (size_t i = 0; i < 8; i++)
    {
        uint16_t val_src = src[i];

        // One mask bit per packed single-precision element position.
        for (size_t j = 0; j < 16; j++)
            des[i].m_U32[j] = (val_src & (1 << j)) ? 1 : 0;
    }
}

// Loads sixteen pairs of single-precision test values (including two NaN
// entries to exercise the ORDERED/UNORDERED predicates), invokes the assembly
// routine Avx512PackedCompareF32_, and streams the resulting masks to cout.
void Avx512PackedCompareF32(void)
{
    // 64-byte alignment is required because the assembly routine loads these
    // operands with vmovaps (aligned move).
    alignas(64) ZmmVal a;
    alignas(64) ZmmVal b;
    uint16_t c[8];

    a.m_F32[0] = 2.0;       b.m_F32[0] = 1.0;
    a.m_F32[1] = 7.0;       b.m_F32[1] = 12.0;
    a.m_F32[2] = -6.0;      b.m_F32[2] = -6.0;
    a.m_F32[3] = 3.0;       b.m_F32[3] = 8.0;
    a.m_F32[4] = -2.0;      b.m_F32[4] = 1.0;
    a.m_F32[5] = 17.0;      b.m_F32[5] = 17.0;
    a.m_F32[6] = 6.5;       b.m_F32[6] = -9.125;
    a.m_F32[7] = 4.875;     b.m_F32[7] = numeric_limits<float>::quiet_NaN();
    a.m_F32[8] = 2.0;       b.m_F32[8] = 101.0;
    a.m_F32[9] = 7.0;       b.m_F32[9] = -312.0;
    a.m_F32[10] = -5.0;     b.m_F32[10] = 15.0;
    a.m_F32[11] = -33.0;    b.m_F32[11] = -33.0;
    a.m_F32[12] = -12.0;    b.m_F32[12] = 198.0;
    a.m_F32[13] = 107.0;    b.m_F32[13] = 107.0;
    a.m_F32[14] = 16.125;   b.m_F32[14] = -2.75;
    a.m_F32[15] = 42.875;   b.m_F32[15] = numeric_limits<float>::quiet_NaN();

    Avx512PackedCompareF32_(&a, &b, c);

    cout << "\nResults for Avx512PackedCompareF32\n";

    // Expand each 16-bit mask into one 0/1 doubleword per element so the
    // masks can be printed with the same grouping as a and b.
    ZmmVal c_display[8];
    ToZmmVal(c_display, c);

    // Four display groups cover the sixteen elements (the sample output
    // shows four values per group).
    for (int sel = 0; sel < 4; sel++)
    {
        cout << setw(12) << "a[" << sel << "]:" << a.ToStringF32(sel) << '\n';
        cout << setw(12) << "b[" << sel << "]:" << b.ToStringF32(sel) << '\n';
        cout << '\n';

        for (int j = 0; j < 8; j++)
            cout << setw(14) << c_CmpStr[j] << ':' << c_display[j].ToStringU32(sel) << '\n';

        cout << '\n';
    }
}

// Program entry point for example Ch13_05: runs the packed compare
// demonstration.
int main()
{
    Avx512PackedCompareF32();
    return 0;
}

;-------------------------------------------------

; Ch13_05.asm

;-------------------------------------------------

include <cmpequ.asmh>

; extern "C" void Avx512PackedCompareF32_(const ZmmVal* a, const ZmmVal* b, uint16_t c[8]);

.code

; Compares the packed single-precision elements of *a (rcx) and *b (rdx)
; using eight compare predicates. Each vcmpps leaves its per-element result
; bits in opmask register k1; the kmovw that follows stores that mask as one
; 16-bit word of the array addressed by r8, at byte offsets 0, 2, ..., 14.
; Predicate order: EQ, NEQ, LT, LE, GT, GE, ORD, UNORD.
Avx512PackedCompareF32_ proc
        vmovaps zmm0,[rcx] ;zmm0 = a
        vmovaps zmm1,[rdx] ;zmm1 = b

; Perform packed EQUAL compare
        vcmpps k1,zmm0,zmm1,CMP_EQ
        kmovw word ptr [r8],k1

; Perform packed NOT EQUAL compare
        vcmpps k1,zmm0,zmm1,CMP_NEQ
        kmovw word ptr [r8+2],k1

; Perform packed LESS THAN compare
        vcmpps k1,zmm0,zmm1,CMP_LT
        kmovw word ptr [r8+4],k1

; Perform packed LESS THAN OR EQUAL compare
        vcmpps k1,zmm0,zmm1,CMP_LE
        kmovw word ptr [r8+6],k1

; Perform packed GREATER THAN compare
        vcmpps k1,zmm0,zmm1,CMP_GT
        kmovw word ptr [r8+8],k1

; Perform packed GREATER THAN OR EQUAL compare
        vcmpps k1,zmm0,zmm1,CMP_GE
        kmovw word ptr [r8+10],k1

; Perform packed ORDERED compare
        vcmpps k1,zmm0,zmm1,CMP_ORD
        kmovw word ptr [r8+12],k1

; Perform packed UNORDERED compare
        vcmpps k1,zmm0,zmm1,CMP_UNORD
        kmovw word ptr [r8+14],k1

; Clear the upper register bits before returning to the caller
        vzeroupper
        ret
Avx512PackedCompareF32_ endp

end

Listing 13-5.

Example Ch13_05

The C++ function Avx512PackedCompareF32 that’s shown in Listing 13-5 starts its execution by loading test values into the single-precision floating-point elements of ZmmVal variables a and b. Note that these variables are defined using the C++ alignas(64) specifier . Following variable initialization, the function Avx512PackedCompareF32 invokes the assembly language function Avx512PackedCompareF32_ to perform the packed compares. It then streams the results to cout.

The assembly language function Avx512PackedCompareF32_ begins its execution with two vmovaps instructions that load ZmmVal variables a and b into registers ZMM0 and ZMM1, respectively. The ensuing vcmpps k1,zmm0,zmm1,CMP_EQ instruction compares the single-precision floating-point elements in registers ZMM0 and ZMM1 for equality. For each element position, this instruction sets the corresponding bit position in opmask register K1 to one if the values in ZMM0 and ZMM1 are equal; otherwise, the opmask register bit is set to zero. Figure 13-1 illustrates this operation in greater detail. The kmovw word ptr [r8],k1 instruction that follows saves the resultant mask to c[0].

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig1_HTML.jpg — Figure 13-1.
Example execution of the *vcmpps k1,zmm0,zmm1,CMP_EQ* instruction

The remaining code in Avx512PackedCompareF32_ performs additional compare operations using the vcmpps instruction, ZmmVal variables a and b, and common compare predicates. Note that like the previous example, Avx512PackedCompareF32_ uses a vzeroupper instruction prior to its ret instruction. Here are the results for source code example Ch13_05.

Results for Avx512PackedCompareF32

a[0]: 2.000000 7.000000 | -6.000000 3.000000

b[0]: 1.000000 12.000000 | -6.000000 8.000000

EQ: 0 0 | 1 0

NE: 1 1 | 0 1

LT: 0 1 | 0 1

LE: 0 1 | 1 1

GT: 1 0 | 0 0

GE: 1 0 | 1 0

ORDERED: 1 1 | 1 1

UNORDERED: 0 0 | 0 0

a[1]: -2.000000 17.000000 | 6.500000 4.875000

b[1]: 1.000000 17.000000 | -9.125000 nan

EQ: 0 1 | 0 0

NE: 1 0 | 1 1

LT: 1 0 | 0 0

LE: 1 1 | 0 0

GT: 0 0 | 1 0

GE: 0 1 | 1 0

ORDERED: 1 1 | 1 0

UNORDERED: 0 0 | 0 1

a[2]: 2.000000 7.000000 | -5.000000 -33.000000

b[2]: 101.000000 -312.000000 | 15.000000 -33.000000

EQ: 0 0 | 0 1

NE: 1 1 | 1 0

LT: 1 0 | 1 0

LE: 1 0 | 1 1

GT: 0 1 | 0 0

GE: 0 1 | 0 1

ORDERED: 1 1 | 1 1

UNORDERED: 0 0 | 0 0

a[3]: -12.000000 107.000000 | 16.125000 42.875000

b[3]: 198.000000 107.000000 | -2.750000 nan

EQ: 0 1 | 0 0

NE: 1 0 | 1 1

LT: 1 0 | 0 0

LE: 1 1 | 0 0

GT: 0 0 | 1 0

GE: 0 1 | 1 0

ORDERED: 1 1 | 1 0

UNORDERED: 0 0 | 0 1

On systems that support AVX-512, assembly language functions can also use the vcmppd instruction with a destination operand opmask register to perform packed double-precision floating-point compares. In these instances, the resultant mask is saved in the low-order eight bits of the destination operand opmask register.

Packed Floating-Point Column Means

Listing 13-6 shows the source code for example Ch13_06. This example, which is an AVX-512 implementation of source code example Ch09_03, calculates column means for a two-dimensional array of double-precision floating-point values. To make the current source code example a little more interesting, the column means are calculated using only the array elements that are greater than or equal to a predetermined threshold value.

//------------------------------------------------

// Ch13_06.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <random>

#include <memory>

using namespace std;

// Test size limits to illustrate argument checking.
// Both limits are defined with C linkage because the assembly file
// (Ch13_06.asm) references them as extern qword symbols.
extern "C" size_t c_NumRowsMax = 1000000;
extern "C" size_t c_NumColsMax = 1000000;

// Assembly implementation of the column-means calculation (Ch13_06.asm).
// Takes the same arguments as Avx512CalcColumnMeansCpp and returns false when
// nrows or ncols is zero or exceeds the limits above.
extern "C" bool Avx512CalcColumnMeans_(const double* x, size_t nrows, size_t ncols, double* col_means, size_t* col_counts, double x_min);

// Fills x[0..n-1] with pseudo-random whole numbers, stored as doubles, drawn
// uniformly from the closed range [rng_min, rng_max]. The generator is seeded
// with `seed`, so equal arguments reproduce the same sequence.
void Init(double* x, size_t n, int rng_min, int rng_max, unsigned int seed)
{
    std::default_random_engine engine {seed};
    std::uniform_int_distribution<> dist {rng_min, rng_max};

    double* const last = x + n;

    for (double* p = x; p != last; ++p)
        *p = static_cast<double>(dist(engine));
}

// Reference C++ implementation of the thresholded column-means algorithm.
//
// x is read as a row-major nrows x ncols array. For every column j, the
// elements that are >= x_min are summed and counted; col_means[j] receives
// sum / count and col_counts[j] receives the count.
//
// Returns false, without touching the output arrays, when nrows or ncols is
// zero or exceeds c_NumRowsMax / c_NumColsMax; returns true otherwise.
//
// Note: a column with no element >= x_min is not special-cased, so its mean is
// the result of dividing 0.0 by a zero count.
bool Avx512CalcColumnMeansCpp(const double* x, size_t nrows, size_t ncols, double* col_means, size_t* col_counts, double x_min)
{
    // Make sure nrows and ncols are valid
    if (nrows == 0 || nrows > c_NumRowsMax)
        return false;
    if (ncols == 0 || ncols > c_NumColsMax)
        return false;

    // Initialize column means and column counts to zero
    for (size_t i = 0; i < ncols; i++)
    {
        col_means[i] = 0.0;
        col_counts[i] = 0;
    }

    // Accumulate per-column sums (kept in col_means) and element counts
    for (size_t i = 0; i < nrows; i++)
    {
        for (size_t j = 0; j < ncols; j++)
        {
            double val = x[i * ncols + j];

            if (val >= x_min)
            {
                col_means[j] += val;
                col_counts[j]++;
            }
        }
    }

    // Convert the column sums into column means
    for (size_t j = 0; j < ncols; j++)
        col_means[j] /= col_counts[j];

    return true;
}

// Test driver for example Ch13_06: builds a 20000 x 23 matrix of random whole
// numbers in [1, 999], computes the thresholded column means with both the
// C++ and the assembly implementations, and prints the two result sets side
// by side, flagging every column where they differ.
void Avx512CalcColumnMeans(void)
{
    const size_t nrows = 20000;
    const size_t ncols = 23;
    const int rng_min = 1;
    const int rng_max = 999;
    const unsigned int rng_seed = 47;
    const double x_min = 75.0;

    unique_ptr<double[]> x {new double[nrows * ncols]};
    unique_ptr<double[]> col_means1 {new double[ncols]};
    unique_ptr<double[]> col_means2 {new double[ncols]};
    unique_ptr<size_t[]> col_counts1 {new size_t[ncols]};
    unique_ptr<size_t[]> col_counts2 {new size_t[ncols]};

    Init(x.get(), nrows * ncols, rng_min, rng_max, rng_seed);

    // Suffix 1 = C++ results, suffix 2 = assembly results
    bool rc1 = Avx512CalcColumnMeansCpp(x.get(), nrows, ncols, col_means1.get(), col_counts1.get(), x_min);
    bool rc2 = Avx512CalcColumnMeans_(x.get(), nrows, ncols, col_means2.get(), col_counts2.get(), x_min);

    cout << "Results for Avx512CalcColumnMeans\n";

    if (!rc1 || !rc2)
    {
        cout << "Invalid return code: ";
        cout << "rc1 = " << boolalpha << rc1 << ", ";
        cout << "rc2 = " << boolalpha << rc2 << '\n';
        return;
    }

    cout << "Test Matrix (nrows = " << nrows << ", ncols = " << ncols << ")\n";
    cout << "\nColumn Means\n";
    cout << fixed << setprecision(4);

    for (size_t j = 0; j < ncols; j++)
    {
        cout << setw(4) << j << ": ";
        cout << "col_means = ";
        cout << setw(10) << col_means1[j] << ", ";
        cout << setw(10) << col_means2[j] << " ";
        cout << "col_counts = ";
        cout << setw(6) << col_counts1[j] << ", ";
        cout << setw(6) << col_counts2[j] << '\n';

        // Exact comparison is intentional: the two implementations are
        // expected to produce identical values for this data set.
        if (col_means1[j] != col_means2[j])
            cout << "col_means compare error\n";

        if (col_counts1[j] != col_counts2[j])
            cout << "col_counts compare error\n";
    }
}

// Program entry point for example Ch13_06: runs the column-means
// demonstration.
int main()
{
    Avx512CalcColumnMeans();
    return 0;
}

;-------------------------------------------------

; Ch13_06.asm

;-------------------------------------------------

include <cmpequ.asmh>

include <MacrosX86-64-AVX.asmh>

extern c_NumRowsMax:qword

extern c_NumColsMax:qword

; extern "C" bool Avx512CalcColumnMeans_(const double* x, size_t nrows, size_t ncols, double* col_means, size_t* col_counts, double x_min);

.code

; Computes, for every column of the nrows x ncols array x (rows stored one
; after another), the mean of the elements that are >= x_min and the number of
; such elements.
;
; Arguments, as used by the code below:
;   rcx = x          rdx = nrows          r8 = ncols          r9 = col_means
;   [rbp+CCM_OffsetStackArgs]   = col_counts
;   [rbp+CCM_OffsetStackArgs+8] = x_min
;
; Returns eax = 0 when nrows or ncols is zero or exceeds c_NumRowsMax /
; c_NumColsMax; otherwise eax = 1.
;
; Each row is processed in chunks of 8, 4, 2 or 1 columns, depending on how
; many columns remain. The chunk code compares the loaded elements with x_min,
; zero-masks the ones that fail the test, adds the rest to the running sums in
; col_means and adds 1 to the matching col_counts entries.
Avx512CalcColumnMeans_ proc frame
        _CreateFrame CCM_,0,0,rbx,r12,r13
        _EndProlog

; Validate nrows and ncols
        xor eax,eax ;set error return code
        test rdx,rdx
        jz Done ;jump if nrows is zero
        cmp rdx,[c_NumRowsMax]
        ja Done ;jump if nrows is too large
        test r8,r8
        jz Done ;jump if ncols is zero
        cmp r8,[c_NumColsMax]
        ja Done ;jump if ncols is too large

; Load argument values col_counts and x_min
        mov ebx,1
        vpbroadcastq zmm4,rbx ;zmm4 = 8 qwords of 1
        mov rbx,[rbp+CCM_OffsetStackArgs] ;rbx = col_counts ptr
        lea r13,[rbp+CCM_OffsetStackArgs+8] ;r13 = ptr to x_min

; Set initial col_means and col_counts to zero
; (rax is still zero from the xor eax,eax above and serves as index i)
        xor r10,r10
        vxorpd xmm0,xmm0,xmm0
@@: vmovsd real8 ptr[r9+rax*8],xmm0 ;col_means[i] = 0.0
        mov [rbx+rax*8],r10 ;col_counts[i] = 0
        inc rax
        cmp rax,r8
        jne @B ;repeat until done

; Compute the sum of each column in x
; LP1 starts a new row; LP2 processes the next chunk of columns in that row.
LP1: xor r10,r10 ;r10 = col_index
        mov r11,r9 ;r11 = ptr to col_means
        mov r12,rbx ;r12 = ptr to col_counts

LP2: mov rax,r10 ;rax = col_index
        add rax,8
        cmp rax,r8 ;8 or more columns remaining?
        ja @F ;jump if col_index + 8 > ncols

; Update col_means and col_counts using next eight columns
        vmovupd zmm0,zmmword ptr [rcx] ;load next 8 cols of cur row
        vcmppd k1,zmm0,real8 bcst [r13],CMP_GE ;k1 = mask of values >= x_min
        vmovupd zmm1{k1}{z},zmm0 ;values >= x_min or 0.0
        vaddpd zmm2,zmm1,zmmword ptr [r11] ;add values to col_means
        vmovupd zmmword ptr [r11],zmm2 ;save updated col_means
        vpmovm2q zmm0,k1 ;convert mask to vector
        vpandq zmm1,zmm0,zmm4 ;qword values for add
        vpaddq zmm2,zmm1,zmmword ptr [r12] ;update col_counts
        vmovdqu64 zmmword ptr [r12],zmm2 ;save updated col_counts
        add r10,8 ;col_index += 8
        add rcx,64 ;x += 8
        add r11,64 ;col_means += 8
        add r12,64 ;col_counts += 8
        jmp NextColSet

; Update col_means and col_counts using next four columns
; (rax holds col_index + 8 on entry, so rax - 4 = col_index + 4)
@@: sub rax,4
        cmp rax,r8 ;4 or more columns remaining?
        ja @F ;jump if col_index + 4 > ncols
        vmovupd ymm0,ymmword ptr [rcx] ;load next 4 cols of cur row
        vcmppd k1,ymm0,real8 bcst [r13],CMP_GE ;k1 = mask of values >= x_min
        vmovupd ymm1{k1}{z},ymm0 ;values >= x_min or 0.0
        vaddpd ymm2,ymm1,ymmword ptr [r11] ;add values to col_means
        vmovupd ymmword ptr [r11],ymm2 ;save updated col_means
        vpmovm2q ymm0,k1 ;convert mask to vector
        vpandq ymm1,ymm0,ymm4 ;qword values for add
        vpaddq ymm2,ymm1,ymmword ptr [r12] ;update col_counts
        vmovdqu64 ymmword ptr [r12],ymm2 ;save updated col_counts
        add r10,4 ;col_index += 4
        add rcx,32 ;x += 4
        add r11,32 ;col_means += 4
        add r12,32 ;col_counts += 4
        jmp NextColSet

; Update col_means and col_counts using next two columns
; (rax holds col_index + 4 on entry, so rax - 2 = col_index + 2)
@@: sub rax,2
        cmp rax,r8 ;2 or more columns remaining?
        ja @F ;jump if col_index + 2 > ncols
        vmovupd xmm0,xmmword ptr [rcx] ;load next 2 cols of cur row
        vcmppd k1,xmm0,real8 bcst [r13],CMP_GE ;k1 = mask of values >= x_min
        vmovupd xmm1{k1}{z},xmm0 ;values >= x_min or 0.0
        vaddpd xmm2,xmm1,xmmword ptr [r11] ;add values to col_means
        vmovupd xmmword ptr [r11],xmm2 ;save updated col_means
        vpmovm2q xmm0,k1 ;convert mask to vector
        vpandq xmm1,xmm0,xmm4 ;qword values for add
        vpaddq xmm2,xmm1,xmmword ptr [r12] ;update col_counts
        vmovdqu64 xmmword ptr [r12],xmm2 ;save updated col_counts
        add r10,2 ;col_index += 2
        add rcx,16 ;x += 2
        add r11,16 ;col_means += 2
        add r12,16 ;col_counts += 2
        jmp NextColSet

; Update col_means and col_counts using last column of current row
; (scalar path; the compare result is moved from k1 into eax and added
; directly to the col_counts entry)
@@: vmovsd xmm0,real8 ptr [rcx] ;load x from last column
        vcmpsd k1,xmm0,real8 ptr [r13],CMP_GE ;k1 = mask of values >= x_min
        vmovsd xmm1{k1}{z},xmm1,xmm0 ;value or 0.0
        vaddsd xmm2,xmm1,real8 ptr [r11] ;add to col_means
        vmovsd real8 ptr [r11],xmm2 ;save updated col_means
        kmovb eax,k1 ;eax = 0 or 1
        add qword ptr [r12],rax ;update col_counts
        add r10,1 ;col_index += 1
        add rcx,8 ;update x ptr

NextColSet:
        cmp r10,r8 ;more columns in current row?
        jb LP2 ;jump if yes
        dec rdx ;nrows -= 1
        jnz LP1 ;jump if more rows

; Compute the final col_means
; (r9, rbx and r8 are consumed here as running pointers and loop counter)
@@: vmovsd xmm0,real8 ptr [r9] ;xmm0 = col_means[i]
        vcvtsi2sd xmm1,xmm1,qword ptr [rbx] ;xmm1 = col_counts[i]
        vdivsd xmm2,xmm0,xmm1 ;compute final mean
        vmovsd real8 ptr [r9],xmm2 ;save col_mean[i]
        add r9,8 ;update col_means ptr
        add rbx,8 ;update col_counts ptr
        sub r8,1 ;ncols -= 1
        jnz @B ;repeat until done
        mov eax,1 ;set success return code

; Common exit, also reached with eax = 0 from the argument checks
Done: _DeleteFrame rbx,r12,r13
        vzeroupper
        ret
Avx512CalcColumnMeans_ endp

end

Listing 13-6.

Example Ch13_06

The function Avx512CalcColumnMeansCpp contains a C++ implementation of the column means algorithm. This function uses two nested for loops to sum the elements of each column in the two-dimensional array. During each inner loop iteration, the value of array element x[i][j] is added to the current column running sum in col_means[j] only if it’s greater than or equal to x_min. The number of elements greater than or equal to x_min in each column is maintained in the array col_counts. Following the summing loops, the final column means are calculated using a simple for loop.

Following its prolog, the function Avx512CalcColumnMeans_ validates argument values nrows and ncols. It then performs its required initializations. The mov ebx,1 and vpbroadcastq zmm4,rbx instructions load the value one into each quadword element of ZMM4. Registers RBX and R13 are then initialized as pointers to col_counts and x_min, respectively. The final initialization task employs a simple for loop that sets each element in col_means and col_counts to zero.

Similar to source code example Ch09_03, the inner for loop in Avx512CalcColumnMeans_ employs slightly different instruction sequences to sum column elements, which vary depending on the number of columns in the array (see Figure 9-2) and the current column index. For each row, elements in the first eight columns of x can be added to col_means using 512-bit wide packed double-precision floating-point addition. The remaining column element values are added to col_means using 512-, 256-, or 128-bit wide packed or scalar double-precision floating-point addition.

The outer loop label LP1 is the starting point for adding elements from the current row of x to col_means. The xor r10,r10 instruction initializes col_index to zero; the mov r11,r9 instruction loads R11 with a pointer to col_means; and mov r12,rbx points R12 to col_counts. Each iteration of the inner loop LP2 begins with a check to ensure that at least eight columns are available in the current row for processing. If eight columns are available, the vmovupd zmm0,zmmword ptr [rcx] instruction loads the next eight elements of the current row into register ZMM0. The ensuing vcmppd k1,zmm0,real8 bcst [r13],CMP_GE instruction compares each element in ZMM0 to x_min and sets the corresponding bit position in opmask register K1 to indicate the result. Note that the embedded broadcast operand of the vcmppd instruction is used here for demonstration purposes. In this source code example, it would be more efficient to initialize a packed version of x_min prior to the start of the processing loops. The next instruction, vmovupd zmm1{k1}{z},zmm0, uses zero masking to effectively eliminate values less than x_min from subsequent calculations. The next two instructions, vaddpd zmm2,zmm1,zmmword ptr [r11] and vmovupd zmmword ptr [r11],zmm2, update the running column sums that are maintained in col_means. Figure 13-2 illustrates this operation in greater detail.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig2_HTML.jpg — Figure 13-2.
Updating the intermediate sums in *col_means* using zero masking

The next code block updates the element counts in col_counts. The vpmovm2q zmm0,k1 instruction (Convert Mask Register to Vector Register) sets each quadword element in ZMM0 to all ones (0xFFFFFFFFFFFFFFFF) or all zeros (0x0000000000000000) according to the value of the corresponding bit position in K1. The ensuing vpandq zmm1,zmm0,zmm4 instruction zeros the high-order 63 bits of each quadword value in ZMM0 and saves this result in ZMM1. The next two instructions, vpaddq zmm2,zmm1,zmmword ptr [r12] and vmovdqu64 zmmword ptr [r12],zmm2, update the count values in col_counts, as shown in Figure 13-3. The vmovdqu64 instruction saves the 512-bit wide packed quadword operand in ZMM2 to the location pointed to by register R12. AVX512F also includes a vmovdqu32 instruction for 512-bit wide packed doubleword moves.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig3_HTML.jpg — Figure 13-3.
Updating the intermediate element counts in *col_counts*

Following execution of the vmovdqu64 instruction, the algorithm’s various pointers and counters are updated to reflect the eight processed elements. The summation code repeats until the number of array elements that remain in the current row is less than eight. Once this condition is met, the remaining column elements (if any) are processed using 256-, 128-, or 64-bit wide operands using the same technique described in the previous paragraph. Note that function Avx512CalcColumnMeans_ uses AVX-512 instructions that employ YMM or XMM registers with embedded broadcast and zero masking operands. These instructions require an AVX-512 conforming processor that supports the AVX512VL instruction set extension. After calculating the column sums, each element in col_means is divided by the corresponding element in col_counts to obtain the final column mean. Here are the results for source code example Ch13_06:

Results for Avx512CalcColumnMeans

Test Matrix (nrows = 20000, ncols = 23)

Column Means

0: col_means = 536.6483, 536.6483 col_counts = 18548, 18548

1: col_means = 535.8669, 535.8669 col_counts = 18538, 18538

2: col_means = 534.7049, 534.7049 col_counts = 18457, 18457

3: col_means = 535.8747, 535.8747 col_counts = 18544, 18544

4: col_means = 540.7477, 540.7477 col_counts = 18501, 18501

5: col_means = 535.9465, 535.9465 col_counts = 18493, 18493

6: col_means = 539.0142, 539.0142 col_counts = 18528, 18528

7: col_means = 536.6623, 536.6623 col_counts = 18496, 18496

8: col_means = 532.1445, 532.1445 col_counts = 18486, 18486

9: col_means = 543.4736, 543.4736 col_counts = 18479, 18479

10: col_means = 535.2980, 535.2980 col_counts = 18552, 18552

11: col_means = 536.4255, 536.4255 col_counts = 18486, 18486

12: col_means = 537.6472, 537.6472 col_counts = 18473, 18473

13: col_means = 537.9775, 537.9775 col_counts = 18511, 18511

14: col_means = 538.4742, 538.4742 col_counts = 18514, 18514

15: col_means = 539.2965, 539.2965 col_counts = 18497, 18497

16: col_means = 537.9710, 537.9710 col_counts = 18454, 18454

17: col_means = 536.7826, 536.7826 col_counts = 18566, 18566

18: col_means = 538.3274, 538.3274 col_counts = 18452, 18452

19: col_means = 538.2181, 538.2181 col_counts = 18491, 18491

20: col_means = 532.6881, 532.6881 col_counts = 18514, 18514

21: col_means = 537.0067, 537.0067 col_counts = 18554, 18554

22: col_means = 539.0643, 539.0643 col_counts = 18548, 18548

Vector Cross Products

The next source code example, Ch13_07, demonstrates vector cross product calculations using arrays of three-dimensional vectors. It also illustrates how to perform data gather and scatter operations using AVX-512 instructions. Listing 13-7 shows the source code for example Ch13_07.

//------------------------------------------------

// Ch13_07.h

//------------------------------------------------

#pragma once

// Three-dimensional vector held as three consecutive doubles (X, Y, Z).
// This is the element type of the array-of-structures (AOS) data set.
struct Vector
{
    double X;   // X component
    double Y;   // Y component
    double Z;   // Z component
};

// Structure-of-arrays (SOA) view of a vector set: one array per component.
// The structure only stores the three pointers; it does not own the arrays.
struct VectorSoA
{
    double* X;  // array of X components
    double* Y;  // array of Y components
    double* Z;  // array of Z components
};

// Ch13_07.cpp

// Fills the AOS arrays (a_aos, b_aos) and the SOA arrays (a_soa, b_soa) with the
// same num_vec random vectors; every component is an integer value in [1, 100].
void InitVec(Vector* a_aos, Vector* b_aos, VectorSoA& a_soa, VectorSoA& b_soa, size_t num_vec);

// C++ cross-product routines for the AOS and SOA layouts. Their definitions are
// not part of this listing - presumably they mirror the assembly versions; TODO confirm.
bool Avx512VcpAosCpp(Vector* c, const Vector* a, const Vector* b, size_t num_vec);

bool Avx512VcpSoaCpp(VectorSoA* c, const VectorSoA* a, const VectorSoA* b, size_t num_vec);

// Ch13_07_.asm

// Computes c[i] = a[i] x b[i] for arrays of Vector structures.
// Returns false when num_vec is zero or not a multiple of 8.
extern "C" bool Avx512VcpAos_(Vector* c, const Vector* a, const Vector* b, size_t num_vec);

// Same calculation on structure-of-arrays data. Returns false when num_vec is zero
// or not a multiple of 8, or when any of the nine component arrays is not aligned
// on a 64-byte boundary.
extern "C" bool Avx512VcpSoa_(VectorSoA* c, const VectorSoA* a, const VectorSoA* b, size_t num_vec);

// Ch13_07_BM.cpp

// Benchmark driver; main() calls it after Avx512Vcp().
extern void Avx512Vcp_BM(void);

//------------------------------------------------

// Ch13_07.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <random>

#include <memory>

#include "Ch13_07.h"

#include "AlignedMem.h"

using namespace std;

// Fills both representations of the input vectors with the same random data.
//
//   a_aos, b_aos - arrays of num_vec Vector structures (AOS layout)
//   a_soa, b_soa - component arrays, each holding num_vec doubles (SOA layout)
//   num_vec      - number of vectors to generate
//
// Every component is an integer value in [1, 100]. The engine uses a fixed seed,
// so the generated sequence is the same on every run.
void InitVec(Vector* a_aos, Vector* b_aos, VectorSoA& a_soa, VectorSoA& b_soa, size_t num_vec)
{
    std::uniform_int_distribution<> ui_dist {1, 100};
    std::default_random_engine rng {103};

    for (size_t i = 0; i < num_vec; i++)
    {
        // Draw order (a.X, a.Y, a.Z, b.X, b.Y, b.Z) determines the generated data.
        const double a_x = static_cast<double>(ui_dist(rng));
        const double a_y = static_cast<double>(ui_dist(rng));
        const double a_z = static_cast<double>(ui_dist(rng));
        const double b_x = static_cast<double>(ui_dist(rng));
        const double b_y = static_cast<double>(ui_dist(rng));
        const double b_z = static_cast<double>(ui_dist(rng));

        // Store identical values in the AOS and SOA data sets.
        a_aos[i].X = a_soa.X[i] = a_x;
        a_aos[i].Y = a_soa.Y[i] = a_y;
        a_aos[i].Z = a_soa.Z[i] = a_z;
        b_aos[i].X = b_soa.X[i] = b_x;
        b_aos[i].Y = b_soa.Y[i] = b_y;
        b_aos[i].Z = b_soa.Z[i] = b_z;
    }
}

void Avx512Vcp(void)

{

const size_t align = 64;

const size_t num_vec = 16;

unique_ptr<Vector> a_aos_up {new Vector[num_vec] };

unique_ptr<Vector> b_aos_up {new Vector[num_vec] };

unique_ptr<Vector> c_aos_up {new Vector[num_vec] };

Vector* a_aos = a_aos_up.get();

Vector* b_aos = b_aos_up.get();

Vector* c_aos = c_aos_up.get();

VectorSoA a_soa, b_soa, c_soa;

AlignedArray<double> a_soa_x_aa(num_vec, align);

AlignedArray<double> a_soa_y_aa(num_vec, align);

AlignedArray<double> a_soa_z_aa(num_vec, align);

AlignedArray<double> b_soa_x_aa(num_vec, align);

AlignedArray<double> b_soa_y_aa(num_vec, align);

AlignedArray<double> b_soa_z_aa(num_vec, align);

AlignedArray<double> c_soa_x_aa(num_vec, align);

AlignedArray<double> c_soa_y_aa(num_vec, align);

AlignedArray<double> c_soa_z_aa(num_vec, align);

a_soa.X = a_soa_x_aa.Data();

a_soa.Y = a_soa_y_aa.Data();

a_soa.Z = a_soa_z_aa.Data();

b_soa.X = b_soa_x_aa.Data();

b_soa.Y = b_soa_y_aa.Data();

b_soa.Z = b_soa_z_aa.Data();

c_soa.X = c_soa_x_aa.Data();

c_soa.Y = c_soa_y_aa.Data();

c_soa.Z = c_soa_z_aa.Data();

InitVec(a_aos, b_aos, a_soa, b_soa, num_vec);

bool rc1 = Avx512VcpAos_(c_aos, a_aos, b_aos, num_vec);

bool rc2 = Avx512VcpSoa_(&c_soa, &a_soa, &b_soa, num_vec);

cout << "Results for Avx512VectorCrossProd ";

if (!rc1 || !rc2)

{

cout << "Invalid return code - ";

cout << "rc1 = " << boolalpha << rc1 << ", ";

cout << "rc2 = " << boolalpha << rc2 << ", ";

return;

}

cout << fixed << setprecision(1);

for (size_t i = 0; i < num_vec; i++)

{

cout << "Vector cross product #" << i << ' ';

const unsigned int w = 9;

cout << " a: ";

cout << setw(w) << a_aos[i].X << ' ';

cout << setw(w) << a_aos[i].Y << ' ';

cout << setw(w) << a_aos[i].Z << ' ';

cout << " b: ";

cout << setw(w) << b_aos[i].X << ' ';

cout << setw(w) << b_aos[i].Y << ' ';

cout << setw(w) << b_aos[i].Z << ' ';

cout << " c_aos: ";

cout << setw(w) << c_aos[i].X << ' ';

cout << setw(w) << c_aos[i].Y << ' ';

cout << setw(w) << c_aos[i].Z << ' ';

cout << " c_soa: ";

cout << setw(w) << c_soa.X[i] << ' ';

cout << setw(w) << c_soa.Y[i] << ' ';

cout << setw(w) << c_soa.Z[i] << ' ';

bool is_valid_x = c_aos[i].X == c_soa.X[i];

bool is_valid_y = c_aos[i].Y == c_soa.Y[i];

bool is_valid_z = c_aos[i].Z == c_soa.Z[i];

if (!is_valid_x || !is_valid_y || !is_valid_z)

{

cout << "Compare error at index " << i << ' ';

cout << " is_valid_x = " << boolalpha << is_valid_x << ' ';

cout << " is_valid_y = " << boolalpha << is_valid_y << ' ';

cout << " is_valid_z = " << boolalpha << is_valid_z << ' ';

return;

}

// Entry point for example Ch13_07: run the cross-product demonstration first,
// then the benchmark.
int main()
{
    Avx512Vcp();        // compare the AOS and SOA results
    Avx512Vcp_BM();     // benchmark timing measurements
    return 0;
}

;-------------------------------------------------

; Ch13_07.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

; Indices for gather and scatter instructions
; A Vector is three consecutive qwords (X, Y, Z), so component n of the k-th
; vector in a block of eight is at qword index 3*k + n. The gather and scatter
; instructions that use these tables scale each index by 8 bytes.

ConstVals segment readonly align(64) 'const'

GS_X qword 0, 3, 6, 9, 12, 15, 18, 21 ;X component of vectors 0-7

GS_Y qword 1, 4, 7, 10, 13, 16, 19, 22 ;Y component of vectors 0-7

GS_Z qword 2, 5, 8, 11, 14, 17, 20, 23 ;Z component of vectors 0-7

ConstVals ends

; extern "C" bool Avx512VcpAos_(Vector* c, const Vector* a, const Vector* b, size_t num_vectors);

.code

; Cross products for arrays of Vector structures (AOS layout).
; Register use: rcx = c (destination), rdx = a, r8 = b, r9 = num_vec.
; Returns eax = 1 on success, eax = 0 if num_vec is zero or not a multiple of 8.
; Each loop iteration processes eight vectors.
Avx512VcpAos_ proc

; Make sure num_vec is valid

xor eax,eax ;set error code (also i = 0)

test r9,r9

jz Done ;jump if num_vec is zero

test r9,07h

jnz Done ;jump if num_vec % 8 != 0 is true

; Load indices for gather and scatter operations

vmovdqa64 zmm29,zmmword ptr [GS_X] ;zmm29 = X component indices

vmovdqa64 zmm30,zmmword ptr [GS_Y] ;zmm30 = Y component indices

vmovdqa64 zmm31,zmmword ptr [GS_Z] ;zmm31 = Z component indices

; Load next 8 vectors
; A gather clears its opmask register as it executes, so an all-ones mask is
; rebuilt with kxnorb before every gather. A different k register is used for
; each gather to avoid a register dependency between them.

align 16

@@: kxnorb k1,k1,k1

vgatherqpd zmm0{k1},[rdx+zmm29*8] ;zmm0 = A.X values

kxnorb k2,k2,k2

vgatherqpd zmm1{k2},[rdx+zmm30*8] ;zmm1 = A.Y values

kxnorb k3,k3,k3

vgatherqpd zmm2{k3},[rdx+zmm31*8] ;zmm2 = A.Z values

kxnorb k4,k4,k4

vgatherqpd zmm3{k4},[r8+zmm29*8] ;zmm3 = B.X values

kxnorb k5,k5,k5

vgatherqpd zmm4{k5},[r8+zmm30*8] ;zmm4 = B.Y values

kxnorb k6,k6,k6

vgatherqpd zmm5{k6},[r8+zmm31*8] ;zmm5 = B.Z values

; Calculate 8 vector cross products

vmulpd zmm16,zmm1,zmm5

vmulpd zmm17,zmm2,zmm4

vsubpd zmm18,zmm16,zmm17 ;c.X = a.Y * b.Z - a.Z * b.Y

vmulpd zmm19,zmm2,zmm3

vmulpd zmm20,zmm0,zmm5

vsubpd zmm21,zmm19,zmm20 ;c.Y = a.Z * b.X - a.X * b.Z

vmulpd zmm22,zmm0,zmm4

vmulpd zmm23,zmm1,zmm3

vsubpd zmm24,zmm22,zmm23 ;c.Z = a.X * b.Y - a.Y * b.X

; Save calculated cross products
; The scatter instructions also clear their opmask, so k4-k6 are set again here.

kxnorb k4,k4,k4

vscatterqpd [rcx+zmm29*8]{k4},zmm18 ;save C.X components

kxnorb k5,k5,k5

vscatterqpd [rcx+zmm30*8]{k5},zmm21 ;save C.Y components

kxnorb k6,k6,k6

vscatterqpd [rcx+zmm31*8]{k6},zmm24 ;save C.Z components

; Update pointers and counters
; 192 bytes = 8 vectors * 3 components * 8 bytes

add rcx,192 ;c += 8

add rdx,192 ;a += 8

add r8,192 ;b += 8

add rax,8 ;i += 8

cmp rax,r9

jb @B

mov eax,1 ;set success return code

Done: vzeroupper

ret

Avx512VcpAos_ endp

; extern "C" bool Avx512VcpSoa_(VectorSoA* c, const VectorSoA* a, const VectorSoA* b, size_t num_vectors);

; Cross products for vectors stored as a structure of arrays (SOA layout).
; Register use: rcx = c, rdx = a, r8 = b (pointers to VectorSoA), r9 = num_vec.
; Returns eax = 1 on success; eax = 0 if num_vec is zero or not a multiple of 8,
; or if any of the nine component arrays is not aligned on a 64-byte boundary.
Avx512VcpSoa_ proc frame
        _CreateFrame CP2_,0,0,rbx,rsi,rdi,r12,r13,r14,r15
        _EndProlog

; Make sure num_vec is valid
        xor eax,eax                         ;error return code; also clears rax for the OR chain below
        test r9,r9
        jz Done                             ;jump if num_vec is zero
        test r9,07h
        jnz Done                            ;jump if num_vec % 8 != 0 is true

; Load vector array pointers and check for proper alignment.
; All nine pointers are OR'ed into rax so that one test covers every array.
        mov r10,[rdx]                       ;r10 = a.X
        or rax,r10
        mov r11,[rdx+8]                     ;r11 = a.Y
        or rax,r11
        mov r12,[rdx+16]                    ;r12 = a.Z
        or rax,r12
        mov r13,[r8]                        ;r13 = b.X
        or rax,r13
        mov r14,[r8+8]                      ;r14 = b.Y
        or rax,r14
        mov r15,[r8+16]                     ;r15 = b.Z
        or rax,r15
        mov rbx,[rcx]                       ;rbx = c.X
        or rax,rbx
        mov rsi,[rcx+8]                     ;rsi = c.Y
        or rax,rsi
        mov rdi,[rcx+16]                    ;rdi = c.Z
        or rax,rdi
        and rax,3fh                         ;misaligned component array?
        mov eax,0                           ;error return code (also i = 0); mov keeps the flags set by 'and'
        jnz Done

; Load next block (8 vectors) from a and b
        align 16
@@:     vmovapd zmm0,zmmword ptr [r10+rax*8]    ;zmm0 = a.X values
        vmovapd zmm1,zmmword ptr [r11+rax*8]    ;zmm1 = a.Y values
        vmovapd zmm2,zmmword ptr [r12+rax*8]    ;zmm2 = a.Z values
        vmovapd zmm3,zmmword ptr [r13+rax*8]    ;zmm3 = b.X values
        vmovapd zmm4,zmmword ptr [r14+rax*8]    ;zmm4 = b.Y values
        vmovapd zmm5,zmmword ptr [r15+rax*8]    ;zmm5 = b.Z values

; Calculate cross products
        vmulpd zmm16,zmm1,zmm5
        vmulpd zmm17,zmm2,zmm4
        vsubpd zmm18,zmm16,zmm17            ;c.X = a.Y * b.Z - a.Z * b.Y

        vmulpd zmm19,zmm2,zmm3
        vmulpd zmm20,zmm0,zmm5
        vsubpd zmm21,zmm19,zmm20            ;c.Y = a.Z * b.X - a.X * b.Z

        vmulpd zmm22,zmm0,zmm4
        vmulpd zmm23,zmm1,zmm3
        vsubpd zmm24,zmm22,zmm23            ;c.Z = a.X * b.Y - a.Y * b.X

; Save calculated cross products
        vmovapd zmmword ptr [rbx+rax*8],zmm18   ;save C.X values
        vmovapd zmmword ptr [rsi+rax*8],zmm21   ;save C.Y values
        vmovapd zmmword ptr [rdi+rax*8],zmm24   ;save C.Z values

        add rax,8                           ;i += 8
        cmp rax,r9
        jb @B                               ;repeat until done

; rax holds num_vec at this point. The bool result is taken from the low byte,
; which would be zero (false) for any num_vec that is a multiple of 256, so the
; success code must be set explicitly.
        mov eax,1                           ;set success return code

Done:   vzeroupper
        _DeleteFrame rbx,rsi,rdi,r12,r13,r14,r15
        ret
Avx512VcpSoa_ endp

end

Listing 13-7.

Example Ch13_07

The cross product of two three-dimensional vectors a and b is a third vector c that is perpendicular to both a and b. The x, y, and z components of c can be calculated using the following equations:

$c_x = a_y b_z - a_z b_y \qquad c_y = a_z b_x - a_x b_z \qquad c_z = a_x b_y - a_y b_x$

The C++ header file Ch13_07.h that’s shown in Listing 13-7 includes the structure definitions Vector and VectorSoA. The structure Vector contains three double-precision floating-point values—X, Y, and Z—that represent the components of a three-dimensional vector. The VectorSoA structure incorporates three pointers to double-precision floating-point arrays. Each array contains the values for a single vector component. Example Ch13_07 uses these structures to compare the performance of two different vector cross product calculating algorithms. The first algorithm performs its calculations using an array of structures (AOS), while the second algorithm exploits a structure of arrays (SOA).

The C++ function Avx512Vcp begins its execution by allocating storage space for sets of vector data structures. This function uses the C++ template class unique_ptr<Vector> to allocate storage for three AOSs. Note that each Vector object is not explicitly aligned on a 64-byte boundary since doing this would consume a considerable amount of storage space that’s never used. Each unique_ptr<Vector> AOS is also representative of how this type of data construct is commonly employed in many real-world programs. Avx512Vcp uses the C++ template class AlignedArray<double> to allocate properly aligned storage space for the vector SOAs. Following data structure allocation, the function InitVec initializes both sets of vectors a and b using random values. It then invokes the assembly language vector cross product functions Avx512VcpAos_ and Avx512VcpSoa_.

Near the top of the assembly language file is a custom constant segment named ConstVals, which contains indices for the vgatherqpd and vscatterqpd instructions that are used in Avx512VcpAos_. The index values in this segment correspond to the memory ordering of Vector components X, Y, and Z in an array of Vector objects. Figure 13-4 illustrates this ordering in greater detail. Note that the indices defined in ConstVals enable the vgatherqpd and vscatterqpd instructions to load and save eight Vector objects.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig4_HTML.jpg — Figure 13-4.
Memory ordering of components X, Y, and Z in an array of Vector objects

Following validation of num_vec, three vmovdqa64 (Move Aligned Packed Quadword Values) instructions load the gather/scatter indices for Vector components X, Y, and Z into registers ZMM29, ZMM30, and ZMM31, respectively. The processing loop begins with a kxnorb k1,k1,k1 instruction that sets the low-order eight bits of opmask register K1 to one. The subsequent vgatherqpd zmm0{k1},[rdx+zmm29*8] instruction loads eight X component values from Vector a into register ZMM0. The vgatherqpd instruction loads eight values since the low-order eight bits of opmask register K1 are all set to one.

Five more sets of kxnorb and vgatherqpd instructions load the remaining Vector components into registers ZMM1–ZMM5. Note that during its execution, the vgatherqpd instruction sets the entire opmask register to zero unless an exception occurs due to an invalid memory access, which can be caused by an incorrect index or bad base register value. This updating of the opmask register introduces a potential register dependency that is eliminated by using a different opmask register for each vgatherqpd instruction. The next code block calculates eight vector cross products using basic packed double-precision floating-point arithmetic. The cross-product results are then saved to the destination Vector array c using three vscatterqpd instructions. Like the vgatherqpd instruction, the vscatterqpd instruction also sets its opmask register operand to zero unless an exception occurs.

The function Avx512VcpSoa_ begins its execution by validating num_vec. It then verifies that the nine vector component array pointers are properly aligned on a 64-byte boundary. The processing loop in Avx512VcpSoa_ employs straightforward packed double-precision floating-point arithmetic to calculate the vector cross products. Note that Avx512VcpSoa_ uses the aligned move instruction vmovapd to perform all vector component loads and stores. Here are the results for source code example Ch13_07:

Results for Avx512VectorCrossProd

Vector cross product #0

a: 96.0 30.0 52.0

b: 64.0 62.0 79.0

c_aos: -854.0 -4256.0 4032.0

c_soa: -854.0 -4256.0 4032.0

Vector cross product #1

a: 26.0 33.0 66.0

b: 89.0 36.0 20.0

c_aos: -1716.0 5354.0 -2001.0

c_soa: -1716.0 5354.0 -2001.0

Vector cross product #2

a: 56.0 60.0 53.0

b: 16.0 45.0 46.0

c_aos: 375.0 -1728.0 1560.0

c_soa: 375.0 -1728.0 1560.0

Vector cross product #3

a: 79.0 27.0 22.0

b: 18.0 75.0 45.0

c_aos: -435.0 -3159.0 5439.0

c_soa: -435.0 -3159.0 5439.0

Vector cross product #4

a: 77.0 30.0 46.0

b: 44.0 77.0 99.0

c_aos: -572.0 -5599.0 4609.0

c_soa: -572.0 -5599.0 4609.0

Vector cross product #5

a: 30.0 21.0 26.0

b: 43.0 61.0 47.0

c_aos: -599.0 -292.0 927.0

c_soa: -599.0 -292.0 927.0

Vector cross product #6

a: 58.0 56.0 46.0

b: 84.0 37.0 76.0

c_aos: 2554.0 -544.0 -2558.0

c_soa: 2554.0 -544.0 -2558.0

Vector cross product #7

a: 34.0 28.0 95.0

b: 20.0 51.0 36.0

c_aos: -3837.0 676.0 1174.0

c_soa: -3837.0 676.0 1174.0

Vector cross product #8

a: 34.0 50.0 35.0

b: 48.0 1.0 24.0

c_aos: 1165.0 864.0 -2366.0

c_soa: 1165.0 864.0 -2366.0

Vector cross product #9

a: 28.0 12.0 46.0

b: 6.0 53.0 77.0

c_aos: -1514.0 -1880.0 1412.0

c_soa: -1514.0 -1880.0 1412.0

Vector cross product #10

a: 43.0 78.0 86.0

b: 12.0 61.0 97.0

c_aos: 2320.0 -3139.0 1687.0

c_soa: 2320.0 -3139.0 1687.0

Vector cross product #11

a: 53.0 78.0 85.0

b: 78.0 34.0 65.0

c_aos: 2180.0 3185.0 -4282.0

c_soa: 2180.0 3185.0 -4282.0

Vector cross product #12

a: 9.0 66.0 2.0

b: 54.0 45.0 55.0

c_aos: 3540.0 -387.0 -3159.0

c_soa: 3540.0 -387.0 -3159.0

Vector cross product #13

a: 15.0 59.0 35.0

b: 94.0 67.0 22.0

c_aos: -1047.0 2960.0 -4541.0

c_soa: -1047.0 2960.0 -4541.0

Vector cross product #14

a: 95.0 20.0 24.0

b: 45.0 85.0 55.0

c_aos: -940.0 -4145.0 7175.0

c_soa: -940.0 -4145.0 7175.0

Vector cross product #15

a: 76.0 77.0 15.0

b: 29.0 95.0 23.0

c_aos: 346.0 -1313.0 4987.0

c_soa: 346.0 -1313.0 4987.0

Running benchmark function Avx512VectorCrossProd_BM - please wait

Benchmark times save to file Ch13_07_Avx512VectorCrossProd_BM_CHROMIUM.csv

Table 13-1 shows benchmark timing measurements for the two cross product calculating functions. This table uses dashes to signify processors that do not support AVX-512. For source code example Ch13_07, the SOA technique is somewhat faster than the AOS method.

Table 13-1.

Benchmark Timing Measurements for Vector Cross Product Calculating Functions (1,000,000 Cross Products)

CPU	Avx512VcpAos_	Avx512VcpSoa_
i7-4790S	----	----
i9-7900X	4734	4141
i7-8700K	----	----

Matrix-Vector Multiplication

Many computer graphics and image processing algorithms perform matrix-vector multiplications using 4 × 4 matrices and 4 × 1 vectors. In 3D computer graphics software, these types of calculations are universally employed to perform affine transformations (e.g., translation, rotation, and scaling) using homogeneous coordinates. Figure 13-5 shows the equations that can be used to multiply a 4 × 4 matrix by a 4 × 1 vector. Note that the components of vector b are a simple sum-of-products calculation of the matrix’s columns and the individual components of vector a. Figure 13-5 also shows a sample matrix-vector multiplication calculation using real numbers.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig5_HTML.jpg — Figure 13-5.
Equations for matrix-vector multiplication and a sample calculation

Listing 13-8 shows the source code for example Ch13_08. This example demonstrates how to multiply a single 4 × 4 matrix with a set of 4 × 1 vectors that are stored in an array.

//------------------------------------------------

// Ch13_08.h

//------------------------------------------------

#pragma once

// 4 x 1 single-precision column vector. The members are stored in the order
// W, X, Y, Z, one float each.
struct Vec4x1_F32
{
    float W;
    float X;
    float Y;
    float Z;
};

// Ch13_08.cpp

// Fills va[0..num_vec-1] with random integer-valued components in [1, 500]; when
// num_vec >= 4 the first four vectors are then overwritten with fixed test values.
extern void InitVecArray(Vec4x1_F32* va, size_t num_vec);

// Computes vec_b[i] = mat * vec_a[i] for num_vec vectors. Returns false when num_vec
// is zero or not a multiple of 4, or when vec_b, mat or vec_a is not 64-byte aligned.
extern bool Avx512MatVecMulF32Cpp(Vec4x1_F32* vec_b, float mat[4][4], Vec4x1_F32* vec_a, size_t num_vec);

// Ch13_08_.asm

// Assembly language version of the same calculation; it rejects the same
// num_vec and alignment conditions as the C++ function.
extern "C" bool Avx512MatVecMulF32_(Vec4x1_F32* vec_b, float mat[4][4], Vec4x1_F32* vec_a, size_t num_vec);

// Ch13_08_BM.cpp

// Benchmark driver; main() calls it after Avx512MatVecMulF32().
extern void Avx512MatVecMulF32_BM(void);

//------------------------------------------------

// Ch13_08.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <random>

#include <cmath>

#include "Ch13_08.h"

#include "AlignedMem.h"

using namespace std;

// Component-wise comparison of two 4x1 vectors.
// Returns true only when |v1 - v2| <= eps holds for each of W, X, Y and Z.
bool VecCompare(const Vec4x1_F32* v1, const Vec4x1_F32* v2)
{
    static const float eps = 1.0e-12f;

    // Written as "difference <= eps" so that a NaN difference counts as a mismatch.
    auto is_close = [](float lhs, float rhs) { return fabs(lhs - rhs) <= eps; };

    return is_close(v1->W, v2->W)
        && is_close(v1->X, v2->X)
        && is_close(v1->Y, v2->Y)
        && is_close(v1->Z, v2->Z);
}

// Initializes an array of 4x1 vectors with test data.
//
//   va      - destination array holding at least num_vec elements
//   num_vec - number of vectors to initialize
//
// Every component is first set to a random integer value in [1, 500]. The engine
// uses a fixed seed, so the sequence is the same on every run. When the array holds
// at least four vectors, the first four are replaced by fixed test values.
void InitVecArray(Vec4x1_F32* va, size_t num_vec)
{
    std::uniform_int_distribution<> ui_dist {1, 500};
    std::default_random_engine rng {187};

    for (size_t i = 0; i < num_vec; i++)
    {
        // Draw order (W, X, Y, Z) determines the generated data.
        va[i].W = static_cast<float>(ui_dist(rng));
        va[i].X = static_cast<float>(ui_dist(rng));
        va[i].Y = static_cast<float>(ui_dist(rng));
        va[i].Z = static_cast<float>(ui_dist(rng));
    }

    if (num_vec >= 4)
    {
        // Test values
        va[0].W = 5;  va[0].X = 6;  va[0].Y = 7;  va[0].Z = 8;
        va[1].W = 15; va[1].X = 16; va[1].Y = 17; va[1].Z = 18;
        va[2].W = 25; va[2].X = 26; va[2].Y = 27; va[2].Z = 28;
        va[3].W = 35; va[3].X = 36; va[3].Y = 37; va[3].Z = 38;
    }
}

// Reference C++ implementation of the matrix-vector product vec_b[i] = mat * vec_a[i].
//
//   vec_b   - destination array (num_vec elements)
//   mat     - 4x4 matrix
//   vec_a   - source array (num_vec elements)
//   num_vec - number of vectors
//
// Returns false without touching vec_b when num_vec is zero or not a multiple of 4,
// or when vec_a, vec_b or mat is not aligned on a 64-byte boundary; true otherwise.
bool Avx512MatVecMulF32Cpp(Vec4x1_F32* vec_b, float mat[4][4], Vec4x1_F32* vec_a, size_t num_vec)
{
    const bool bad_count = (num_vec == 0 || num_vec % 4 != 0);

    if (bad_count
        || !AlignedMem::IsAligned(vec_a, 64) || !AlignedMem::IsAligned(vec_b, 64)
        || !AlignedMem::IsAligned(mat, 64))
    {
        return false;
    }

    for (size_t i = 0; i < num_vec; i++)
    {
        const Vec4x1_F32& src = vec_a[i];
        Vec4x1_F32& dst = vec_b[i];

        // Each result component is a row of mat times the source vector,
        // accumulated in two halves.
        dst.W = mat[0][0] * src.W + mat[0][1] * src.X;
        dst.W += mat[0][2] * src.Y + mat[0][3] * src.Z;

        dst.X = mat[1][0] * src.W + mat[1][1] * src.X;
        dst.X += mat[1][2] * src.Y + mat[1][3] * src.Z;

        dst.Y = mat[2][0] * src.W + mat[2][1] * src.X;
        dst.Y += mat[2][2] * src.Y + mat[2][3] * src.Z;

        dst.Z = mat[3][0] * src.W + mat[3][1] * src.X;
        dst.Z += mat[3][2] * src.Y + mat[3][3] * src.Z;
    }

    return true;
}

// Multiplies a fixed 4x4 test matrix by an array of 4x1 vectors using both the C++
// and the assembly language function, prints the two result sets, and stops at the
// first vector for which they disagree.
void Avx512MatVecMulF32(void)
{
    const size_t num_vec = 8;

    // Both calculating functions require the matrix on a 64-byte boundary.
    alignas(64) float mat[4][4]
    {
        10.0, 11.0, 12.0, 13.0,
        20.0, 21.0, 22.0, 23.0,
        30.0, 31.0, 32.0, 33.0,
        40.0, 41.0, 42.0, 43.0
    };

    AlignedArray<Vec4x1_F32> vec_a_aa(num_vec, 64);
    AlignedArray<Vec4x1_F32> vec_b1_aa(num_vec, 64);
    AlignedArray<Vec4x1_F32> vec_b2_aa(num_vec, 64);
    Vec4x1_F32* vec_a = vec_a_aa.Data();
    Vec4x1_F32* vec_b1 = vec_b1_aa.Data();     // C++ results
    Vec4x1_F32* vec_b2 = vec_b2_aa.Data();     // assembly language results

    InitVecArray(vec_a, num_vec);

    bool rc1 = Avx512MatVecMulF32Cpp(vec_b1, mat, vec_a, num_vec);
    bool rc2 = Avx512MatVecMulF32_(vec_b2, mat, vec_a, num_vec);

    cout << "Results for Avx512MatVecMulF32 ";

    if (!rc1 || !rc2)
    {
        cout << "Invalid return code ";
        cout << " rc1 = " << boolalpha << rc1 << ' ';
        cout << " rc2 = " << boolalpha << rc2 << ' ';
        return;
    }

    const unsigned int w = 8;
    cout << fixed << setprecision(1);

    for (size_t i = 0; i < num_vec; i++)
    {
        cout << "Test case #" << i << ' ';

        cout << "vec_b1: ";
        cout << " " << setw(w) << vec_b1[i].W << ' ';
        cout << " " << setw(w) << vec_b1[i].X << ' ';
        cout << " " << setw(w) << vec_b1[i].Y << ' ';
        cout << " " << setw(w) << vec_b1[i].Z << ' ';

        cout << "vec_b2: ";
        cout << " " << setw(w) << vec_b2[i].W << ' ';
        cout << " " << setw(w) << vec_b2[i].X << ' ';
        cout << " " << setw(w) << vec_b2[i].Y << ' ';
        cout << " " << setw(w) << vec_b2[i].Z << ' ';

        if (!VecCompare(&vec_b1[i], &vec_b2[i]))
        {
            cout << "Error - vector compare failed ";
            return;
        }
    }
}

// Entry point for example Ch13_08: run the matrix-vector multiplication
// demonstration first, then the benchmark.
int main()
{
    Avx512MatVecMulF32();       // compare the C++ and assembly language results
    Avx512MatVecMulF32_BM();    // benchmark timing measurements
    return 0;
}

;-------------------------------------------------

; Ch13_08.asm

;-------------------------------------------------

ConstVals segment readonly align(64) 'const'

; Indices for matrix permutations
; The 4x4 matrix is loaded as 16 consecutive floats, so element [r][c] is at
; index 4*r + c. MatPermN selects column N (indices N, N+4, N+8, N+12) and
; repeats it four times, once for each of the four vectors processed together.

MatPerm0 dword 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12

MatPerm1 dword 1, 5, 9, 13, 1, 5, 9, 13, 1, 5, 9, 13, 1, 5, 9, 13

MatPerm2 dword 2, 6, 10, 14, 2, 6, 10, 14, 2, 6, 10, 14, 2, 6, 10, 14

MatPerm3 dword 3, 7, 11, 15, 3, 7, 11, 15, 3, 7, 11, 15, 3, 7, 11, 15

; Indices for vector permutations
; A block of four vectors is 16 floats (W, X, Y, Z of each vector). VecPermN
; replicates component N of each vector across that vector's group of four elements.

VecPerm0 dword 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12

VecPerm1 dword 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13

VecPerm2 dword 2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14

VecPerm3 dword 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15

ConstVals ends

; extern "C" bool Avx512MatVecMulF32_(Vec4x1_F32* vec_b, float mat[4][4], Vec4x1_F32* vec_a, size_t num_vec);

.code

; Multiplies one 4x4 matrix by an array of 4x1 vectors, four vectors per iteration.
; Register use: rcx = vec_b (destination), rdx = mat, r8 = vec_a, r9 = num_vec.
; Returns eax = 1 on success; eax = 0 if num_vec is zero or not a multiple of 4,
; or if vec_b, mat or vec_a is not aligned on a 64-byte boundary.
Avx512MatVecMulF32_ proc

xor eax,eax ;set error code (also i = 0)

test r9,r9

jz Done ;jump if num_vec is zero

test r9,3

jnz Done ;jump if n % 4 != 0

test rcx,3fh

jnz Done ;jump if vec_b is not properly aligned

test rdx,3fh

jnz Done ;jump if mat is not properly aligned

test r8,3fh

jnz Done ;jump if vec_a is not properly aligned

; Load permutation indices for matrix columns and vector elements

vmovdqa32 zmm16,zmmword ptr [MatPerm0] ;mat col 0 indices

vmovdqa32 zmm17,zmmword ptr [MatPerm1] ;mat col 1 indices

vmovdqa32 zmm18,zmmword ptr [MatPerm2] ;mat col 2 indices

vmovdqa32 zmm19,zmmword ptr [MatPerm3] ;mat col 3 indices

vmovdqa32 zmm24,zmmword ptr [VecPerm0] ;W component indices

vmovdqa32 zmm25,zmmword ptr [VecPerm1] ;X component indices

vmovdqa32 zmm26,zmmword ptr [VecPerm2] ;Y component indices

vmovdqa32 zmm27,zmmword ptr [VecPerm3] ;Z component indices

; Load source matrix and duplicate columns
; This is done once, before the loop; zmm20-zmm23 stay unchanged while the
; vectors are processed.

vmovaps zmm0,zmmword ptr [rdx] ;zmm0 = mat

vpermps zmm20,zmm16,zmm0 ;zmm20 = mat col 0 (4x)

vpermps zmm21,zmm17,zmm0 ;zmm21 = mat col 1 (4x)

vpermps zmm22,zmm18,zmm0 ;zmm22 = mat col 2 (4x)

vpermps zmm23,zmm19,zmm0 ;zmm23 = mat col 3 (4x)

; Load the next 4 vectors
; rax is the byte offset into vec_a and vec_b; r9 counts the vectors that remain.

align 16

@@: vmovaps zmm4,zmmword ptr [r8+rax] ;zmm4 = vec_a (4 vectors)

; Permute the vector elements for subsequent calculations

vpermps zmm0,zmm24,zmm4 ;zmm0 = vec_a W components

vpermps zmm1,zmm25,zmm4 ;zmm1 = vec_a X components

vpermps zmm2,zmm26,zmm4 ;zmm2 = vec_a Y components

vpermps zmm3,zmm27,zmm4 ;zmm3 = vec_a Z components

; Perform matrix-vector multiplications (4 vectors)
; result = col0 * W + col1 * X + col2 * Y + col3 * Z, summed as two pairs

vmulps zmm28,zmm20,zmm0

vmulps zmm29,zmm21,zmm1

vmulps zmm30,zmm22,zmm2

vmulps zmm31,zmm23,zmm3

vaddps zmm4,zmm28,zmm29

vaddps zmm5,zmm30,zmm31

vaddps zmm4,zmm4,zmm5 ;zmm4 = vec_b (4 vectors)

vmovaps zmmword ptr [rcx+rax],zmm4 ;save result

add rax,64 ;rax = offset to next block of 4 vectors

sub r9,4

jnz @B ;repeat until done

mov eax,1 ;set success code

Done: vzeroupper

ret

Avx512MatVecMulF32_ endp

end

Listing 13-8.

Example Ch13_08

The C++ code in Listing 13-8 begins with header file Ch13_08.h that contains the requisite function declarations. This file also includes a declaration for the structure Vec4x1_F32, which incorporates the four components of a 4 × 1 column vector. The source code file Ch13_08.cpp includes a function named Avx512MatVecMulF32Cpp. This function implements the matrix-vector multiplication equations that are shown in Figure 13-5. The remaining C++ code in Listing 13-8 performs test case initializations, invokes the calculating functions, and displays the results.

The assembly language code in Listing 13-8 starts with a constant data segment that defines a series of packed permutation indices. The assembly language implementation of the matrix-vector multiplication algorithm uses these values to reorder the elements of the source matrix and vectors. The reason for this reordering is to facilitate the simultaneous calculation of four matrix-vector products. The function Avx512MatVecMulF32_ begins its execution by validating num_vec for divisibility by four. It then checks the matrix and vector buffer pointers for proper alignment on a 64-byte boundary.

Following argument validation, four vmovdqa32 instructions load the matrix permutation indices into registers ZMM16–ZMM19. This is followed by another series of four vmovdqa32 instructions that load the vector permutation indices into registers ZMM24–ZMM27. The ensuing vmovaps zmm0, zmmword ptr [rdx] instruction loads all 16 single-precision floating-point elements of matrix mat into ZMM0. The vpermps zmm20,zmm16,zmm0 instruction (Permute Single-Precision Floating-Point Elements) rearranges the elements in ZMM0 according to the indices in ZMM16. Execution of this instruction loads four copies of column 0 from matrix mat into register ZMM20. Three more vpermps instructions are then employed to perform the same operation using columns 1, 2, and 3. Figure 13-6 elucidates the execution of these permutations in greater detail.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig6_HTML.jpg — Figure 13-6.
Permutation of matrix columns using *vpermps* instructions

The processing loop in Avx512MatVecMulF32_ starts with a vmovaps zmm4,zmmword ptr [r8+rax] instruction that loads four Vec4x1_F32 vectors into register ZMM4. The W, X, Y, and Z components of these vectors are then regrouped using another series of vpermps instructions. Following execution of these instructions, registers ZMM0–ZMM3 contain repeated sets of the vector components, as shown in Figure 13-7.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig7_HTML.jpg — Figure 13-7.
Permutation of vector components using *vpermps* instructions

Following the vector component permutations, a series of vmulps and vaddps instructions carry out four simultaneous matrix-vector multiplications. Figure 13-8 illustrates this operation in greater detail. The ensuing vmovaps zmmword ptr [rcx+rax],zmm4 instruction saves the four resultant 4 × 1 vectors in the vec_b array. The processing loop then repeats until all vectors in vec_a have been processed.

../images/326959_2_En_13_Chapter/326959_2_En_13_Fig8_HTML.jpg — Figure 13-8.
Matrix-vector multiplications using *vmulps* and *vaddps*

The output for source code example Ch13_08 follows this paragraph. Table 13-2 shows benchmark timing measurements for the C++ and assembly language matrix-vector multiplication functions.

Results for Avx512MatVecMulF32

Test case #0

vec_b1: 304.0 564.0 824.0 1084.0

vec_b2: 304.0 564.0 824.0 1084.0

Test case #1

vec_b1: 764.0 1424.0 2084.0 2744.0

vec_b2: 764.0 1424.0 2084.0 2744.0

Test case #2

vec_b1: 1224.0 2284.0 3344.0 4404.0

vec_b2: 1224.0 2284.0 3344.0 4404.0

Test case #3

vec_b1: 1684.0 3144.0 4604.0 6064.0

vec_b2: 1684.0 3144.0 4604.0 6064.0

Test case #4

vec_b1: 11932.0 22452.0 32972.0 43492.0

vec_b2: 11932.0 22452.0 32972.0 43492.0

Test case #5

vec_b1: 17125.0 31705.0 46285.0 60865.0

vec_b2: 17125.0 31705.0 46285.0 60865.0

Test case #6

vec_b1: 12723.0 23873.0 35023.0 46173.0

vec_b2: 12723.0 23873.0 35023.0 46173.0

Test case #7

vec_b1: 15121.0 27871.0 40621.0 53371.0

vec_b2: 15121.0 27871.0 40621.0 53371.0

Running benchmark function Avx512MatVecMulF32_BM - please wait

Benchmark times save to file Ch13_08_Avx512MatVecMulF32_BM_CHROMIUM.csv

Table 13-2.

Benchmark Timing Measurements for Matrix-Vector Multiplication Functions (1,000,000 Vectors)

CPU	Avx512MatVecMulF32Cpp	Avx512MatVecMulF32_
i7-4790S	----	----
i9-7900X	6174	1778
i7-8700K	----	----

Convolutions

Listing 13-9 shows the source code for example Ch13_09. This example is an AVX-512 implementation of the convolution program that was presented in source code example Ch11_02. The primary purpose of this example is to highlight the conversion of functions that use AVX2 instructions to ones that exploit AVX-512 instructions. It also provides an opportunity to compare benchmark timing measurements between the AVX2 and AVX-512 implementations of the convolution functions.

;-------------------------------------------------

; Ch13_09_.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

extern c_NumPtsMin:dword

extern c_NumPtsMax:dword

extern c_KernelSizeMin:dword

extern c_KernelSizeMax:dword

; extern bool Avx512Convolve2_(float* y, const float* x, int num_pts, const float* kernel, int kernel_size)

.code

; Variable-size kernel convolution, 16 output points per outer-loop iteration.
; Register use: rcx = y, rdx = x, r8d = num_pts, r9 = kernel; kernel_size is the
; first stack argument.
; Returns eax = 1 on success; eax = 0 if kernel_size is even or outside
; [c_KernelSizeMin, c_KernelSizeMax], if num_pts is outside [c_NumPtsMin, c_NumPtsMax]
; or not a multiple of 16, or if y is not aligned on a 64-byte boundary.
Avx512Convolve2_ proc frame
        _CreateFrame CV2_,0,0,rbx
        _EndProlog

; Validate argument values
        xor eax,eax                         ;set error code
        mov r10d,dword ptr [rbp+CV2_OffsetStackArgs]    ;r10d = kernel_size
        test r10d,1
        jz Done                             ;kernel_size is even
        cmp r10d,[c_KernelSizeMin]
        jl Done                             ;kernel_size too small
        cmp r10d,[c_KernelSizeMax]
        jg Done                             ;kernel_size too big
        cmp r8d,[c_NumPtsMin]
        jl Done                             ;num_pts too small
        cmp r8d,[c_NumPtsMax]
        jg Done                             ;num_pts too big
        test r8d,15
        jnz Done                            ;num_pts not even multiple of 16
        test rcx,3fh
        jnz Done                            ;y is not properly aligned

; Initialize convolution loop variables
; num_pts is a 32-bit argument, so the upper half of r8 is undefined on entry.
; It is cleared here because the loop test below compares all 64 bits of r8.
        mov r8d,r8d                         ;r8 = num_pts
        shr r10d,1                          ;r10 = kernel_size / 2 (ks2)
        lea rdx,[rdx+r10*4]                 ;rdx = x + ks2 (first data point)
        xor ebx,ebx                         ;i = 0

; Perform convolution
LP1:    vxorps zmm0,zmm0,zmm0               ;packed sum = 0.0;
        mov r11,r10                         ;r11 = ks2
        neg r11                             ;k = -ks2

LP2:    mov rax,rbx                         ;rax = i
        sub rax,r11                         ;rax = i - k
        vmovups zmm1,zmmword ptr [rdx+rax*4]    ;load x[i - k]:x[i - k + 15]

        mov rax,r11
        add rax,r10                         ;rax = k + ks2
        vbroadcastss zmm2,real4 ptr [r9+rax*4]  ;zmm2 = kernel[k + ks2]
        vfmadd231ps zmm0,zmm1,zmm2          ;zmm0 += x[i-k]:x[i-k+15] * kernel[k+ks2]

        add r11,1                           ;k += 1
        cmp r11,r10
        jle LP2                             ;repeat until k > ks2

        vmovaps zmmword ptr [rcx+rbx*4],zmm0    ;save y[i]:y[i + 15]

        add rbx,16                          ;i += 16
        cmp rbx,r8
        jl LP1                              ;repeat until done

        mov eax,1                           ;set success return code

Done:   vzeroupper
        _DeleteFrame rbx
        ret
Avx512Convolve2_ endp

; extern bool Avx512Convolve2Ks5_(float* y, const float* x, int num_pts, const float* kernel, int kernel_size)

; Fixed-size (five element) kernel convolution, 16 output points per iteration.
; Register use: rcx = y, rdx = x, r8d = num_pts, r9 = kernel; kernel_size is the
; first stack argument and must be 5.
; Returns eax = 1 on success; eax = 0 if kernel_size is not 5, if num_pts is outside
; [c_NumPtsMin, c_NumPtsMax] or not a multiple of 16, or if y is not aligned on a
; 64-byte boundary.
Avx512Convolve2Ks5_ proc frame

_CreateFrame CKS5_,0,48

_SaveXmmRegs xmm6,xmm7,xmm8

_EndProlog

; Validate argument values

xor eax,eax ;set error code (rax is also loop index var)

cmp dword ptr [rbp+CKS5_OffsetStackArgs],5

jne Done ;jump if kernel_size is not 5

cmp r8d,[c_NumPtsMin]

jl Done ;jump if num_pts too small

cmp r8d,[c_NumPtsMax]

jg Done ;jump if num_pts too big

test r8d,15

jnz Done ;num_pts not even multiple of 16

test rcx,3fh

jnz Done ;y is not properly aligned

; Perform required initializations
; The five kernel coefficients are broadcast once and kept in zmm4-zmm8.

vbroadcastss zmm4,real4 ptr [r9] ;kernel[0]

vbroadcastss zmm5,real4 ptr [r9+4] ;kernel[1]

vbroadcastss zmm6,real4 ptr [r9+8] ;kernel[2]

vbroadcastss zmm7,real4 ptr [r9+12] ;kernel[3]

vbroadcastss zmm8,real4 ptr [r9+16] ;kernel[4]

mov r8d,r8d ;r8 = num_pts (clears the upper 32 bits for the 64-bit compare below)

add rdx,8 ;x += 2

; Perform convolution
; The products are accumulated in two registers (zmm2 and zmm3) that are
; added together at the end of each iteration.

@@: vxorps zmm2,zmm2,zmm2 ;initialize sum vars

vxorps zmm3,zmm3,zmm3

mov r11,rax

add r11,2 ;j = i + ks2

vmovups zmm0,zmmword ptr [rdx+r11*4] ;zmm0 = x[j]:x[j + 15]

vfmadd231ps zmm2,zmm0,zmm4 ;zmm2 += x[j]:x[j + 15] * kernel[0]

vmovups zmm1,zmmword ptr [rdx+r11*4-4] ;zmm1 = x[j - 1]:x[j + 14]

vfmadd231ps zmm3,zmm1,zmm5 ;zmm3 += x[j - 1]:x[j + 14] * kernel[1]

vmovups zmm0,zmmword ptr [rdx+r11*4-8] ;zmm0 = x[j - 2]:x[j + 13]

vfmadd231ps zmm2,zmm0,zmm6 ;zmm2 += x[j - 2]:x[j + 13] * kernel[2]

vmovups zmm1,zmmword ptr [rdx+r11*4-12] ;zmm1 = x[j - 3]:x[j + 12]

vfmadd231ps zmm3,zmm1,zmm7 ;zmm3 += x[j - 3]:x[j + 12] * kernel[3]

vmovups zmm0,zmmword ptr [rdx+r11*4-16] ;zmm0 = x[j - 4]:x[j + 11]

vfmadd231ps zmm2,zmm0,zmm8 ;zmm2 += x[j - 4]:x[j + 11] * kernel[4]

vaddps zmm0,zmm2,zmm3 ;final values

vmovaps zmmword ptr [rcx+rax*4],zmm0 ;save y[i]:y[i + 15]

add rax,16 ;i += 16

cmp rax,r8

jl @B ;jump if i < num_pts

mov eax,1 ;set success return code

Done: vzeroupper

_RestoreXmmRegs xmm6,xmm7,xmm8

_DeleteFrame

ret

Avx512Convolve2Ks5_ endp

end

Listing 13-9.

Example Ch13_09

The C++ portion of source code example Ch13_09 is not shown in Listing 13-9 since it’s almost identical to the C++ code in example Ch11_02. Modifications made in the Ch13_09 C++ code include a few function name changes. The test arrays are also allocated on a 64-byte instead of a 32-byte boundary.

The assembly language function Avx512Convolve2_ implements the variable-size kernel convolution algorithm that’s described in Chapter 11. The primary difference between this function and its AVX2 counterpart Convolve2_ (see Listing 11-2) is the use of ZMM registers instead of YMM registers. The code that adjusts the index counter in register RBX was also modified to reflect the processing of 16 data points per iteration instead of 8. Similar changes were also made to the fixed-size kernel convolution function Avx512Convolve2Ks5_.

The output for source code example Ch13_09 is not shown since it’s the same as the output for source code example Ch11_02. Table 13-3 shows the benchmark timing measurements for functions Avx512Convolve2_ and Avx512Convolve2Ks5_. This table also includes the benchmark timing measurements for the AVX2 functions Convolve2_ and Convolve2Ks5_ from Table 11-2. The AVX-512 implementations are faster than their AVX2 counterparts, especially for the size-independent convolution function Avx512Convolve2_. It would, of course, be imprudent to extrapolate any general conclusions regarding AVX-512 versus AVX2 performance based solely on the timing measurements shown in Table 13-3. You’ll see other examples in Chapter 14.

Table 13-3.

Mean Execution Times (Microseconds) for AVX2 and AVX-512 Convolution Functions Using Five-Element Convolution Kernel (2,000,000 Signal Points)

CPU	Convolve2_	Avx512Convolve2_	Convolve2Ks5_	Avx512Convolve2Ks5_
i7-4790S	1244	-----	1067	----
i9-7900X	956	757	719	693
i7-8700K	859	-----	595	----

Summary

Here are the key learning points for Chapter 13.

When using merge masking with scalar or packed operands, the processor carries out the instruction’s calculation only if the corresponding opmask register bit is set to one. Otherwise, no calculation is performed and the destination operand element remains unchanged.
AVX-512 assembly language functions can use an opmask register destination operand with most instructions that perform scalar or packed compare operations. The bits of the opmask register can then be employed to effect data-driven logic decisions sans any conditional jump instructions using either merge or zero masking and (if necessary) simple Boolean operations.
AVX-512 assembly language functions must use the vmovdqu[32|64] and vmovdqa[32|64] instructions to perform move operations using 512-bit wide packed doubleword and quadword integer operands. These instructions can also be used with 256-bit and 128-bit wide operands.
Unlike AVX and AVX2, AVX-512 includes instructions that perform conversions between floating-point and unsigned integer operands.
AVX-512 functions should ensure that packed 128-, 256-, and 512-bit wide operands are aligned on a proper boundary whenever possible.
Assembly language functions that use AVX-512 instructions with ZMM0–ZMM15 or YMM0–YMM15 register operands should always use a vzeroupper instruction before program control is transferred back to the calling function.
Assembly language functions and algorithms that employ a structure of arrays are often faster than those that use an array of structures.
The Visual C++ calling convention treats AVX-512 registers ZMM16–ZMM31, YMM16–YMM31, and XMM16–XMM31 as volatile across function boundaries. This means that a function can use these registers without needing to preserve their values.

..................Content has been hidden....................

You can't read the whole page of this ebook. Please click here to log in and view the full page.

Table of Contents for 13. AVX-512 Programming – Floating-Point

Create new playlist

Sign In

Sign Up

13. AVX-512 Programming – Floating-Point

Scalar Floating-Point

Merge Masking

Zero Masking

Instruction-Level Rounding

Packed Floating-Point

Packed Floating-Point Arithmetic

Packed Floating-Point Compares

Packed Floating-Point Column Means

Vector Cross Products

Matrix-Vector Multiplication

Convolutions

Summary

Table of Contents for
13. AVX-512 Programming – Floating-Point