Search in book...
Toggle Font Controls
Create new playlist

Name your new playlist

Playlist description (optional)
Sign In

Email address

Password

Forgot Password?

or

Continue with Facebook

Continue with Google
Sign Up

Full Name

Email address

Confirm Email Address

Password

or

Continue with Facebook

Continue with Google

Daniel Kusswurm, Modern X86 Assembly Language Programming, https://doi.org/10.1007/978-1-4842-4063-2_14

14. AVX-512 Programming – Packed Integers

Daniel Kusswurm¹

(1)

Geneva, IL, USA

In Chapters 7 and 10, you learned how to use the AVX and AVX2 instruction sets to perform packed integer operations using 128-bit and 256-bit wide operands. In this chapter, you learn how to use the AVX-512 instruction set to carry out packed integer operations using 512-bit wide operands. You also learn how to use AVX-512 instructions with 256-bit and 128-bit wide packed integer operands. The first source code example explains how to perform basic packed integer arithmetic using ZMM registers. This is followed by several examples that exemplify image-processing algorithms and techniques using AVX-512 instructions. Like the previous chapter, all of the source code examples in this chapter require a processor and operating system that support AVX-512 and the following instruction set extensions: AVX512F, AVX512CD, AVX512BW, AVX512DQ, and AVX512VL. You can use one of the freely available utilities listed in Appendix A to determine whether your system supports these extensions.

Basic Arithmetic

Listing 14-1 shows the source code for example Ch14_01. This example demonstrates how to perform basic packed integer arithmetic using 512-bit wide operands and the ZMM register set.

//------------------------------------------------

// Ch14_01.cpp

//------------------------------------------------

#include "stdafx.h"

#include <cstdint>

#include <iostream>

#include <iomanip>

#include "Zmmval.h"

using namespace std;

extern "C" void Avx512PackedMathI16_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6]);

// Assembly routine (Ch14_01.asm). Stores six zero-masked quadword results in
// c[0]..c[5] (vpaddq, vpsubq, vpmullq, vpsllvq, vpsravq, vpabsq), so the
// caller must supply room for six ZmmVal elements. The low 8 bits of opmask
// select which quadword lanes are computed; unselected lanes are zeroed.
extern "C" void Avx512PackedMathI64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6], uint32_t opmask);

void Avx512PackedMathI16(void)

{

alignas(64) ZmmVal a;

alignas(64) ZmmVal b;

alignas(64) ZmmVal c[6];

a.m_I16[0] = 10; b.m_I16[0] = 100;

a.m_I16[1] = 20; b.m_I16[1] = 200;

a.m_I16[2] = 30; b.m_I16[2] = 300;

a.m_I16[3] = 40; b.m_I16[3] = 400;

a.m_I16[4] = 50; b.m_I16[4] = 500;

a.m_I16[5] = 60; b.m_I16[5] = 600;

a.m_I16[6] = 70; b.m_I16[6] = 700;

a.m_I16[7] = 80; b.m_I16[7] = 800;

a.m_I16[8] = 1000; b.m_I16[8] = -100;

a.m_I16[9] = 2000; b.m_I16[9] = 200;

a.m_I16[10] = 3000; b.m_I16[10] = -300;

a.m_I16[11] = 4000; b.m_I16[11] = 400;

a.m_I16[12] = 5000; b.m_I16[12] = -500;

a.m_I16[13] = 6000; b.m_I16[13] = 600;

a.m_I16[14] = 7000; b.m_I16[14] = -700;

a.m_I16[15] = 8000; b.m_I16[15] = 800;

a.m_I16[16] = -1000; b.m_I16[16] = 100;

a.m_I16[17] = -2000; b.m_I16[17] = -200;

a.m_I16[18] = 3000; b.m_I16[18] = 303;

a.m_I16[19] = 4000; b.m_I16[19] = -400;

a.m_I16[20] = -5000; b.m_I16[20] = 500;

a.m_I16[21] = -6000; b.m_I16[21] = -600;

a.m_I16[22] = -7000; b.m_I16[22] = 700;

a.m_I16[23] = -8000; b.m_I16[23] = 800;

a.m_I16[24] = 30000; b.m_I16[24] = 3000; // add overflow

a.m_I16[25] = 6000; b.m_I16[25] = 32000; // add overflow

a.m_I16[26] = -25000; b.m_I16[26] = -27000; // add overflow

a.m_I16[27] = 8000; b.m_I16[27] = 28700; // add overflow

a.m_I16[28] = 2000; b.m_I16[28] = -31000; // sub overflow

a.m_I16[29] = 4000; b.m_I16[29] = -30000; // sub overflow

a.m_I16[30] = -3000; b.m_I16[30] = 32000; // sub overflow

a.m_I16[31] = -15000; b.m_I16[31] = 24000; // sub overflow

Avx512PackedMathI16_(&a, &b, c);

cout <<" Results for Avx512PackedMathI16 ";

cout << " i a b vpaddw vpaddsw vpsubw vpsubsw vpminsw vpmaxsw ";

cout << "-------------------------------------------------------------------------- ";

for (int i = 0; i < 32; i++)

{

cout << setw(2) << i << ' ';

cout << setw(8) << a.m_I16[i] << ' ';

cout << setw(8) << b.m_I16[i] << ' ';

cout << setw(8) << c[0].m_I16[i] << ' ';

cout << setw(8) << c[1].m_I16[i] << ' ';

cout << setw(8) << c[2].m_I16[i] << ' ';

cout << setw(8) << c[3].m_I16[i] << ' ';

cout << setw(8) << c[4].m_I16[i] << ' ';

cout << setw(8) << c[5].m_I16[i] << ' ';

}

void Avx512PackedMathI64(void)

{

alignas(64) ZmmVal a;

alignas(64) ZmmVal b;

alignas(64) ZmmVal c[6];

uint32_t opmask = 0x7f;

a.m_I64[0] = 64; b.m_I64[0] = 4;

a.m_I64[1] = 1024; b.m_I64[1] = 5;

a.m_I64[2] = -2048; b.m_I64[2] = 2;

a.m_I64[3] = 8192; b.m_I64[3] = 5;

a.m_I64[4] = -256; b.m_I64[4] = 8;

a.m_I64[5] = 4096; b.m_I64[5] = 7;

a.m_I64[6] = 16; b.m_I64[6] = 3;

a.m_I64[7] = 512; b.m_I64[7] = 6;

Avx512PackedMathI64_(&a, &b, c, opmask);

cout << " Results for Avx512PackedMathI64 ";

cout << "op_mask = " << hex << opmask << dec << ' ';

cout << " i a b vpaddq vpsubq vpmullq vpsllvq vpsravq vpabsq ";

cout << "---------------------------------------------------------------------- ";

for (int i = 0; i < 8; i++)

{

cout << setw(2) << i << ' ';

cout << setw(6) << a.m_I64[i] << ' ';

cout << setw(6) << b.m_I64[i] << ' ';

cout << setw(8) << c[0].m_I64[i] << ' ';

cout << setw(8) << c[1].m_I64[i] << ' ';

cout << setw(8) << c[2].m_I64[i] << ' ';

cout << setw(8) << c[3].m_I64[i] << ' ';

cout << setw(8) << c[4].m_I64[i] << ' ';

cout << setw(8) << c[5].m_I64[i] << ' ';

}

// Entry point for example Ch14_01: runs the packed word demonstration first,
// then the packed quadword demonstration.
int main()
{
    Avx512PackedMathI16();      // 512-bit packed word arithmetic
    Avx512PackedMathI64();      // 512-bit packed quadword arithmetic (zero masking)

    return 0;
}

;-------------------------------------------------

; Ch14_01.asm

;-------------------------------------------------

; extern "C" void Avx512PackedMathI16_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6])

.code

; Avx512PackedMathI16_
;
; Performs six packed signed word operations on two 512-bit operands and
; stores each 512-bit result in consecutive elements of c.
;
; Registers on entry (per the prototype comment and the accesses below):
;   rcx = a, rdx = b, r8 = c (six ZmmVal elements, 64 bytes each)
;
; All three pointers must be 64-byte aligned because every load and store
; uses vmovdqa64. AVX512BW has no aligned move specific to byte or word
; elements, so the quadword form is used for the unmasked 512-bit transfers;
; vmovdqu16 would only be needed if merge or zero masking were required.

Avx512PackedMathI16_ proc

        vmovdqa64 zmm0,zmmword ptr [rcx]        ;zmm0 = a
        vmovdqa64 zmm1,zmmword ptr [rdx]        ;zmm1 = b

; Perform packed word operations

        vpaddw zmm2,zmm0,zmm1                   ;add (wraparound)
        vmovdqa64 zmmword ptr [r8],zmm2         ;save vpaddw result  (c[0])

        vpaddsw zmm2,zmm0,zmm1                  ;add with signed saturation
        vmovdqa64 zmmword ptr [r8+64],zmm2      ;save vpaddsw result (c[1])

        vpsubw zmm2,zmm0,zmm1                   ;sub (wraparound)
        vmovdqa64 zmmword ptr [r8+128],zmm2     ;save vpsubw result  (c[2])

        vpsubsw zmm2,zmm0,zmm1                  ;sub with signed saturation
        vmovdqa64 zmmword ptr [r8+192],zmm2     ;save vpsubsw result (c[3])

        vpminsw zmm2,zmm0,zmm1                  ;signed minimums
        vmovdqa64 zmmword ptr [r8+256],zmm2     ;save vpminsw result (c[4])

        vpmaxsw zmm2,zmm0,zmm1                  ;signed maximums
        vmovdqa64 zmmword ptr [r8+320],zmm2     ;save vpmaxsw result (c[5])

        vzeroupper
        ret

Avx512PackedMathI16_ endp

; extern "C" void Avx512PackedMathI64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6], unsigned int opmask)
;
; Performs six packed quadword operations on two 512-bit operands using zero
; masking and stores each 512-bit result in consecutive elements of c
; (offsets 0, 64, 128, 192, 256 and 320 from r8, i.e. six ZmmVal elements).
;
; Registers on entry (per the prototype above and the accesses below):
;   rcx = a, rdx = b, r8 = c, r9d = opmask
;
; a, b and c are accessed with vmovdqa64 and therefore must be 64-byte aligned.
; Only the low 8 bits of opmask are used (one bit per quadword element); with
; {k1}{z}, result elements whose mask bit is 0 are set to zero.

Avx512PackedMathI64_ proc

vmovdqa64 zmm0,zmmword ptr [rcx] ;zmm0 = a

vmovdqa64 zmm1,zmmword ptr [rdx] ;zmm1 = b

and r9d,0ffh ;r9d = opmask value (low 8 bits only)

kmovb k1,r9d ;k1 = opmask

; Perform packed quadword operations (zero masking selected by k1)

vpaddq zmm2{k1}{z},zmm0,zmm1 ;add

vmovdqa64 zmmword ptr [r8],zmm2 ;save vpaddq result

vpsubq zmm2{k1}{z},zmm0,zmm1 ;sub

vmovdqa64 zmmword ptr [r8+64],zmm2 ;save vpsubq result

vpmullq zmm2{k1}{z},zmm0,zmm1 ;signed mul (low 64 bits)

vmovdqa64 zmmword ptr [r8+128],zmm2 ;save vpmullq result

vpsllvq zmm2{k1}{z},zmm0,zmm1 ;shift left logical (per-element counts from zmm1)

vmovdqa64 zmmword ptr [r8+192],zmm2 ;save vpsllvq result

vpsravq zmm2{k1}{z},zmm0,zmm1 ;shift right arithmetic (per-element counts from zmm1)

vmovdqa64 zmmword ptr [r8+256],zmm2 ;save vpsravq result

vpabsq zmm2{k1}{z},zmm0 ;absolute value

vmovdqa64 zmmword ptr [r8+320],zmm2 ;save vpabsq result

vzeroupper

ret

Avx512PackedMathI64_ endp

end

Listing 14-1.

Example Ch14_01

The C++ functions Avx512PackedMathI16 and Avx512PackedMathI64 are the base routines that handle AVX-512 packed integer operations using word and quadword values. Each function begins its execution by initializing the applicable integer elements of two ZmmVal variables. Note that the C++ alignas(64) specifier is used with each ZmmVal. Following variable initialization, each base routine invokes its corresponding assembly language function: Avx512PackedMathI16_ or Avx512PackedMathI64_. The results are then streamed to cout.

The assembly language function Avx512PackedMathI16_ starts its execution with two vmovdqa64 instructions that load ZmmVal variables a and b into registers ZMM0 and ZMM1, respectively. Somewhat surprisingly, AVX512BW does not include aligned move instructions for 512-bit wide packed byte and word operands. Another alternative here would be to use the vmovdqu16 instruction. Note that this latter instruction must be used in cases where merge or zero masking is required. AVX512BW also includes a vmovdqu8 instruction for 512-bit wide packed byte operands. Following operand value loading, Avx512PackedMathI16_ demonstrates the packed word instructions vpaddw, vpaddsw, vpsubw, vpsubsw, vpminsw, vpmaxsw. Each 512-bit packed word result is then saved in the array c. Note that Avx512PackedMathI16_ uses a vzeroupper instruction prior to its ret instruction.

The assembly language function Avx512PackedMathI64_ exemplifies various arithmetic operations using 512-bit wide packed quadword instructions. Note that this function includes an argument value named opmask, which is employed to highlight packed quadword zero masking. Avx512PackedMathI64_ also uses a vzeroupper instruction prior to its ret instruction. Here are the results for source code example Ch14_01.

Results for Avx512PackedMathI16

i a b vpaddw vpaddsw vpsubw vpsubsw vpminsw vpmaxsw

--------------------------------------------------------------------------

0 10 100 110 110 -90 -90 10 100

1 20 200 220 220 -180 -180 20 200

2 30 300 330 330 -270 -270 30 300

3 40 400 440 440 -360 -360 40 400

4 50 500 550 550 -450 -450 50 500

5 60 600 660 660 -540 -540 60 600

6 70 700 770 770 -630 -630 70 700

7 80 800 880 880 -720 -720 80 800

8 1000 -100 900 900 1100 1100 -100 1000

9 2000 200 2200 2200 1800 1800 200 2000

10 3000 -300 2700 2700 3300 3300 -300 3000

11 4000 400 4400 4400 3600 3600 400 4000

12 5000 -500 4500 4500 5500 5500 -500 5000

13 6000 600 6600 6600 5400 5400 600 6000

14 7000 -700 6300 6300 7700 7700 -700 7000

15 8000 800 8800 8800 7200 7200 800 8000

16 -1000 100 -900 -900 -1100 -1100 -1000 100

17 -2000 -200 -2200 -2200 -1800 -1800 -2000 -200

18 3000 303 3303 3303 2697 2697 303 3000

19 4000 -400 3600 3600 4400 4400 -400 4000

20 -5000 500 -4500 -4500 -5500 -5500 -5000 500

21 -6000 -600 -6600 -6600 -5400 -5400 -6000 -600

22 -7000 700 -6300 -6300 -7700 -7700 -7000 700

23 -8000 800 -7200 -7200 -8800 -8800 -8000 800

24 30000 3000 -32536 32767 27000 27000 3000 30000

25 6000 32000 -27536 32767 -26000 -26000 6000 32000

26 -25000 -27000 13536 -32768 2000 2000 -27000 -25000

27 8000 28700 -28836 32767 -20700 -20700 8000 28700

28 2000 -31000 -29000 -29000 -32536 32767 -31000 2000

29 4000 -30000 -26000 -26000 -31536 32767 -30000 4000

30 -3000 32000 29000 29000 30536 -32768 -3000 32000

31 -15000 24000 9000 9000 26536 -32768 -15000 24000

Results for Avx512PackedMathI64

op_mask = 7f

i a b vpaddq vpsubq vpmullq vpsllvq vpsravq vpabsq

----------------------------------------------------------------------

0 64 4 68 60 256 1024 4 64

1 1024 5 1029 1019 5120 32768 32 1024

2 -2048 2 -2046 -2050 -4096 -8192 -512 2048

3 8192 5 8197 8187 40960 262144 256 8192

4 -256 8 -248 -264 -2048 -65536 -1 256

5 4096 7 4103 4089 28672 524288 32 4096

6 16 3 19 13 48 128 2 16

7 512 6 0 0 0 0 0 0

Image Processing

The source code examples in this section explicate image-processing algorithms and techniques using AVX-512 packed integer instructions. Most of the source code examples are updated versions of examples from earlier chapters that exploited AVX or AVX2 instructions. Besides exemplifying AVX-512 packed integer instruction usage, the source code examples that follow also accentuate alternative algorithmic approaches and instruction sequences that often result in improved performance.

Pixel Conversions

In Chapter 7, you learned how to use the AVX instruction set to convert unsigned 8-bit pixels to single-precision floating-point pixels and vice versa (see example Ch07_06). Source code example Ch14_02 demonstrates how to carry out these same conversions using AVX-512 instructions. Listing 14-2 shows the source code for example Ch14_02.

//------------------------------------------------

// Ch14_02.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <cstdint>

#include <random>

#include "AlignedMem.h"

using namespace std;

// Ch14_02_Misc.cpp

extern bool Avx512ConvertImgU8ToF32Cpp(float* des, const uint8_t* src, uint32_t num_pixels);

extern bool Avx512ConvertImgF32ToU8Cpp(uint8_t* des, const float* src, uint32_t num_pixels);

extern uint32_t Avx512ConvertImgVerify(const float* src1, const float* src2, uint32_t num_pixels);

extern uint32_t Avx512ConvertImgVerify(const uint8_t* src1, const uint8_t* src2, uint32_t num_pixels);

// Ch14_02_.asm

extern "C" bool Avx512ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels);

extern "C" bool Avx512ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels);

// Fills x[0..n-1] with pseudo-random 8-bit pixel values.
//
// x    - destination buffer; must have room for n elements
// n    - number of elements to write
// seed - seed for the random engine; the same seed reproduces the same
//        sequence (for a given standard library implementation)
//
// Values are drawn from a uniform integer distribution over [0, 255].
void InitU8(uint8_t* x, uint32_t n, unsigned int seed)
{
    std::default_random_engine engine {seed};
    std::uniform_int_distribution<> pixel_dist {0, 255};

    uint8_t* const stop = x + n;

    for (uint8_t* p = x; p != stop; ++p)
        *p = static_cast<uint8_t>(pixel_dist(engine));
}

// Fills x[0..n-1] with pseudo-random floating-point pixel values.
//
// x    - destination buffer; must have room for n elements
// n    - number of elements to write
// seed - seed for the random engine; the same seed reproduces the same
//        sequence (for a given standard library implementation)
//
// Each value is an integer drawn uniformly from [0, 1000] divided by 1000.0f,
// so results lie in [0.0, 1.0] in steps of one thousandth.
void InitF32(float* x, uint32_t n, unsigned int seed)
{
    std::default_random_engine engine {seed};
    std::uniform_int_distribution<> milli_dist {0, 1000};

    float* const stop = x + n;

    for (float* p = x; p != stop; ++p)
        *p = static_cast<float>(milli_dist(engine)) / 1000.0f;
}

// Test driver for the uint8_t -> float pixel conversion.
//
// Allocates 64-byte aligned source and destination buffers for 1024 pixels,
// fills the source with pseudo-random pixel values, runs both the C++ and the
// assembly conversion functions, and reports how many destination pixels
// differ between the two. If either function rejects its arguments, the
// return codes are printed instead.
void Avx512ConvertImgU8ToF32(void)
{
    const size_t align = 64;
    const uint32_t num_pixels = 1024;

    AlignedArray<uint8_t> src_aa(num_pixels, align);
    AlignedArray<float> des1_aa(num_pixels, align);
    AlignedArray<float> des2_aa(num_pixels, align);

    uint8_t* src = src_aa.Data();
    float* des1 = des1_aa.Data();       // written by the C++ function
    float* des2 = des2_aa.Data();       // written by the assembly function

    InitU8(src, num_pixels, 12);

    const bool rc1 = Avx512ConvertImgU8ToF32Cpp(des1, src, num_pixels);
    const bool rc2 = Avx512ConvertImgU8ToF32_(des2, src, num_pixels);

    std::cout << "\nResults for Avx512ConvertImgU8ToF32\n";

    if (!rc1 || !rc2)
    {
        std::cout << "Invalid return code - ";
        std::cout << "rc1 = " << std::boolalpha << rc1 << ", ";
        std::cout << "rc2 = " << std::boolalpha << rc2 << '\n';
        return;
    }

    const uint32_t num_diff = Avx512ConvertImgVerify(des1, des2, num_pixels);

    std::cout << " Number of pixel compare errors (num_diff) = " << num_diff << '\n';
}

// Test driver for the float -> uint8_t pixel conversion.
//
// Allocates 64-byte aligned source and destination buffers for 1024 pixels,
// fills the source with pseudo-random values, overwrites the first 16 source
// pixels with known values (several of them outside [0.0, 1.0]) to exercise
// clipping, runs both the C++ and the assembly conversion functions, and
// reports how many destination pixels differ between the two. If either
// function rejects its arguments, the return codes are printed instead.
void Avx512ConvertImgF32ToU8(void)
{
    const size_t align = 64;
    const uint32_t num_pixels = 1024;

    AlignedArray<float> src_aa(num_pixels, align);
    AlignedArray<uint8_t> des1_aa(num_pixels, align);
    AlignedArray<uint8_t> des2_aa(num_pixels, align);

    float* src = src_aa.Data();
    uint8_t* des1 = des1_aa.Data();     // written by the C++ function
    uint8_t* des2 = des2_aa.Data();     // written by the assembly function

    InitF32(src, num_pixels, 20);

    // Test values to demonstrate clipping in conversion functions
    static const float clip_test_vals[16] =
    {
        0.5f,  -1.0f,  0.38f, 0.62f,  2.1f,   0.25f, -1.25f, 0.45f,
        3.33f,  0.67f, 0.75f, 0.95f, -0.33f,  0.8f,   0.12f, 4.0f
    };

    for (uint32_t i = 0; i < 16; i++)
        src[i] = clip_test_vals[i];

    const bool rc1 = Avx512ConvertImgF32ToU8Cpp(des1, src, num_pixels);
    const bool rc2 = Avx512ConvertImgF32ToU8_(des2, src, num_pixels);

    std::cout << "\nResults for Avx512ConvertImgF32ToU8\n";

    if (!rc1 || !rc2)
    {
        std::cout << "Invalid return code - ";
        std::cout << "rc1 = " << std::boolalpha << rc1 << ", ";
        std::cout << "rc2 = " << std::boolalpha << rc2 << '\n';
        return;
    }

    const uint32_t num_diff = Avx512ConvertImgVerify(des1, des2, num_pixels);

    std::cout << " Number of pixel compare errors (num_diff) = " << num_diff << '\n';
}

// Entry point for example Ch14_02: runs the uint8_t -> float conversion test
// first, then the float -> uint8_t conversion test.
int main()
{
    Avx512ConvertImgU8ToF32();      // unsigned byte pixels to normalized floats
    Avx512ConvertImgF32ToU8();      // normalized floats to unsigned byte pixels

    return 0;
}

;-------------------------------------------------

; Ch14_02.asm

;-------------------------------------------------

include <cmpequ.asmh>

extern c_NumPixelsMax:dword

.const

r4_1p0 real4 1.0

r4_255p0 real4 255.0

; extern "C" bool Avx512ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels)

.code

; Avx512ConvertImgU8ToF32_
;
; Converts num_pixels unsigned 8-bit pixels at src to single-precision
; floating-point pixels at des, scaling each value by 1.0 / 255.0.
;
; Registers on entry (per the prototype comment and the accesses below):
;   rcx = des, rdx = src, r8d = num_pixels
;
; Returns eax = 1 on success. Returns eax = 0 without writing anything if
; num_pixels is zero, exceeds c_NumPixelsMax, is not a multiple of 64, or if
; des or src is not aligned on a 64-byte boundary.
;
; Each loop iteration handles one block of 64 pixels: 64 source bytes in,
; 256 destination bytes out.

Avx512ConvertImgU8ToF32_ proc

; Make sure num_pixels is valid and pixel buffers are properly aligned

xor eax,eax ;set error return code

or r8d,r8d

jz Done ;jump if num_pixels is zero

cmp r8d,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8d,3fh

jnz Done ;jump if num_pixels % 64 != 0

test rcx,3fh

jnz Done ;jump if des not aligned

test rdx,3fh

jnz Done ;jump if src not aligned

; Perform required initializations

shr r8d,6 ;number of blocks (64 pixels/block)

vmovss xmm0,real4 ptr [r4_1p0]

vdivss xmm1,xmm0,real4 ptr [r4_255p0] ;xmm1 = 1.0 / 255.0 (computed once, outside the loop)

vbroadcastss zmm5,xmm1 ;packed scale factor (1.0 / 255.0)

align 16

; Load 4 x 16 bytes and zero-extend each byte to a doubleword

@@: vpmovzxbd zmm0,xmmword ptr [rdx]

vpmovzxbd zmm1,xmmword ptr [rdx+16]

vpmovzxbd zmm2,xmmword ptr [rdx+32]

vpmovzxbd zmm3,xmmword ptr [rdx+48] ;zmm3:zmm0 = 64 U32 pixels

; Convert pixels from uint8_t to float [0.0, 255.0]

vcvtudq2ps zmm16,zmm0

vcvtudq2ps zmm17,zmm1

vcvtudq2ps zmm18,zmm2

vcvtudq2ps zmm19,zmm3 ;zmm19:zmm16 = 64 F32 pixels

; Normalize pixels to [0.0, 1.0]

vmulps zmm20,zmm16,zmm5

vmulps zmm21,zmm17,zmm5

vmulps zmm22,zmm18,zmm5

vmulps zmm23,zmm19,zmm5 ;zmm23:zmm20 = 64 F32 pixels (normalized)

; Save F32 pixels to des

vmovaps zmmword ptr [rcx],zmm20

vmovaps zmmword ptr [rcx+64],zmm21

vmovaps zmmword ptr [rcx+128],zmm22

vmovaps zmmword ptr [rcx+192],zmm23

; Update pointers and counters

add rdx,64 ;src advances 64 bytes (64 uint8_t pixels)

add rcx,256 ;des advances 256 bytes (64 float pixels)

sub r8d,1

jnz @B

mov eax,1 ;set success return code

Done: vzeroupper

ret

Avx512ConvertImgU8ToF32_ endp

; extern "C" bool Avx512ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels)

; Avx512ConvertImgF32ToU8_
;
; Converts num_pixels single-precision floating-point pixels at src to
; unsigned 8-bit pixels at des. Each source value is first clipped to
; [0.0, 1.0], then multiplied by 255.0 and converted to an unsigned integer.
;
; Registers on entry (per the prototype comment and the accesses below):
;   rcx = des, rdx = src, r8d = num_pixels
;
; Returns eax = 1 on success. Returns eax = 0 without writing anything if
; num_pixels is zero, exceeds c_NumPixelsMax, is not a multiple of 64, or if
; des or src is not aligned on a 64-byte boundary.
;
; Each loop iteration handles one block of 16 pixels: 64 source bytes in,
; 16 destination bytes out.

Avx512ConvertImgF32ToU8_ proc

; Make sure num_pixels is valid and pixel buffers are properly aligned

xor eax,eax ;set error return code

or r8d,r8d

jz Done ;jump if num_pixels is zero

cmp r8d,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8d,3fh

jnz Done ;jump if num_pixels % 64 != 0

test rcx,3fh

jnz Done ;jump if des not aligned

test rdx,3fh

jnz Done ;jump if src not aligned

; Perform required initializations

shr r8d,4 ;number of pixel blocks (16 pixels / block)

vxorps zmm29,zmm29,zmm29 ;packed 0.0

vbroadcastss zmm30,[r4_1p0] ;packed 1.0

vbroadcastss zmm31,[r4_255p0] ;packed 255.0

align 16

@@: vmovaps zmm0,zmmword ptr [rdx] ;zmm0 = block of 16 pixels

; Clip pixels in current block to [0.0, 1.0]

vcmpps k1,zmm0,zmm29,CMP_GE ;k1 = mask of pixels >= 0.0

vmovaps zmm1{k1}{z},zmm0 ;all pixels >= 0.0 (zero masking replaces the others with 0.0)

vcmpps k2,zmm1,zmm30,CMP_GT ;k2 = mask of pixels > 1.0

vmovaps zmm1{k2},zmm30 ;all pixels clipped to [0.0, 1.0] (merge masking writes 1.0 where k2 is set)

; Convert pixels to uint8_t and save to des

vmulps zmm2,zmm1,zmm31 ;all pixels [0.0, 255.0]

vcvtps2udq zmm3,zmm2{ru-sae} ;all pixels [0, 255] (instruction-level round-up control)

vpmovusdb xmmword ptr [rcx],zmm3 ;save pixels as unsigned bytes (unsigned saturation)

; Update pointers and counters

add rdx,64 ;src advances 64 bytes (16 float pixels)

add rcx,16 ;des advances 16 bytes (16 uint8_t pixels)

sub r8d,1

jnz @B

mov eax,1 ;set success return code

Done: vzeroupper

ret

Avx512ConvertImgF32ToU8_ endp

end

Listing 14-2.

Example Ch14_02

The C++ code in Listing 14-2 begins with the requisite function declarations. The first declaration set is for the functions Avx512ConvertImgU8ToF32Cpp and Avx512ConvertImgF32ToU8Cpp, which are defined in the file Ch14_02_Misc.cpp. The source code for these functions is not shown since they’re almost identical to the AVX counterpart functions that were used in source code example Ch07_06. Two minor changes were made: the source and destination pixel buffers are aligned on a 64-byte instead of a 16-byte boundary; the number of pixels in these buffers must be evenly divisible by 64 instead of 32.

The function Avx512ConvertImgU8ToF32 initializes the test arrays for converting pixels values from uint8_t to float. This function uses the C++ template class AlignedArray<> to allocate these arrays on a 64-byte boundary. Following test array initialization, Avx512ConvertImgU8ToF32 invokes the C++ and assembly language conversion functions. It then calls Avx512ConvertImgVerify to verify the results. The function Avx512ConvertImgF32ToU8 converts pixel values from float to uint8_t. Note that this function intentionally initializes the first few values of the source pixel buffer src to known values in order to verify that the conversion functions properly clip out-of-range pixel values.

The assembly language function Avx512ConvertImgU8ToF32_ begins its execution by validating num_pixels. It then confirms that the pixel buffers src and des are properly aligned on a 64-byte boundary. In source code example Ch07_06 from Chapter 7, pixel normalization was performed by dividing each pixel value by 255.0. Avx512ConvertImgU8ToF32_ carries out pixel normalization using the multiplicative scale factor 1.0/255.0 since floating-point multiplication is usually faster than floating-point division. The vbroadcastss zmm5,xmm1 instruction loads a packed version of this scale factor into register ZMM5.

Each processing loop iteration starts with a vpmovzxbd zmm0,xmmword ptr [rdx] instruction. This instruction copies the 16 byte-sized (uint8_t) pixels pointed to by RDX and zero-extends them to doublewords; it then saves these values in register ZMM0. Three more vpmovzxbd instructions are then employed to load another 48 pixels into registers ZMM1, ZMM2, and ZMM3. This is followed by four vcvtudq2ps instructions that convert each unsigned doubleword pixel value in registers ZMM0–ZMM3 to single-precision floating-point. The ensuing vmulps instructions multiply these values by the normalization scale factor; the results are then saved to the destination pixel buffer des using a series of vmovaps instructions.

In source code example Ch07_06, all floating-point pixel values were clipped to [0.0, 1.0] before being converted to uint8_t values. The function Avx512ConvertImgF32ToU8_ also performs this same operation. Following its argument validation checks, Avx512ConvertImgF32ToU8_ loads registers ZMM29, ZMM30, and ZMM31 with packed versions of the single-precision floating-point constants 0.0, 1.0, and 255.0, respectively. The processing loop of Avx512ConvertImgF32ToU8_ begins each iteration with a vmovaps zmm0,zmmword ptr [rdx] that loads a block of 16 single-precision floating-point pixels into register ZMM0. The ensuing vcmpps k1,zmm0,zmm29,CMP_GE instruction compares each pixel element in ZMM0 to 0.0 and saves the resultant compare mask in opmask register K1. The next instruction, vmovaps zmm1{k1}{z},zmm0, uses zero masking to eliminate all pixel values less than 0.0. Figure 14-1 illustrates these operations.

../images/326959_2_En_14_Chapter/326959_2_En_14_Fig1_HTML.jpg — Figure 14-1.
Instruction sequence used to convert packed pixel values from floating-point to unsigned doubleword integers

The subsequent vcmpps k2,zmm1,zmm30,CMP_GT instruction creates a mask of pixel values greater than 1.0 and saves this mask in opmask register K2. Following execution of the vmovaps zmm1{k2},zmm30 instruction, all of the pixel values in register ZMM1 are greater than or equal to 0.0 and less than or equal to 1.0. The next two instructions, vmulps zmm2,zmm1,zmm31 and vcvtps2udq zmm3,zmm2{ru-sae}, convert the normalized floating-point pixel values to unsigned doubleword integers. Note that the vcvtps2udq instruction employs an instruction-level rounding control operand (round up) primarily for demonstration purposes. The ensuing vpmovusdb xmmword ptr [rcx],zmm3 instruction size-reduces the doubleword values to bytes using unsigned saturation and saves them in the destination buffer pointed to by RCX. Here are the results for source code example Ch14_02:

Results for Avx512ConvertImgU8ToF32

Number of pixel compare errors (num_diff) = 0

Results for Avx512ConvertImgF32ToU8

Number of pixel compare errors (num_diff) = 0

Image Thresholding

In source code example Ch07_08, you learned about image thresholding and how to create a binary (or two color) mask image . Briefly, thresholding is an image-processing technique that sets a mask image pixel to 0xff to signify that the intensity value of the corresponding pixel in a grayscale image is greater than a pre-determined threshold intensity value; otherwise, the mask image pixel is set to 0x00. The next source code example, Ch14_03, expands the image-thresholding technique that was used in Ch07_08 to support multiple compare operators. Listing 14-3 shows the source code for example Ch14_03.

//------------------------------------------------

// Ch14_03.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Compare operators

// Compare operators accepted by the pixel compare functions.
// The numeric values are part of the interface: the assembly routine
// Avx512ComparePixels_ selects its compare loop by testing the raw integer
// value (0 through 5), so the order below must not change.
enum CmpOp
{
    EQ = 0,     // equal
    NE = 1,     // not equal
    LT = 2,     // less than
    LE = 3,     // less than or equal
    GT = 4,     // greater than
    GE = 5      // greater than or equal
};

// Ch14_03_Misc.cpp

extern void Init(uint8_t* x, size_t n, unsigned int seed);

extern void ShowResults(const uint8_t* des1, const uint8_t* des2, size_t num_pixels, CmpOp cmp_op,

uint8_t cmp_val, size_t test_id);

// Ch14_03_.asm

extern "C" bool Avx512ComparePixels_(uint8_t* des, const uint8_t* src, size_t num_pixels,

CmpOp cmp_op, uint8_t cmp_val);

//------------------------------------------------

// Ch14_03.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <cassert>

#include "Ch14_03.h"

#include "AlignedMem.h"

using namespace std;

// Largest num_pixels value accepted by the pixel compare functions.
// Defined with C linkage because Ch14_03.asm imports it by its unmangled name
// (extern c_NumPixelsMax:qword) and compares num_pixels against it.
extern "C" const size_t c_NumPixelsMax = 16777216;

// Helper for Avx512ComparePixelsCpp: sets des[i] to 0xff where pred(src[i])
// is true and to 0x00 where it is false, for i in [0, num_pixels).
template <typename Pred>
static void FillPixelMask(uint8_t* des, const uint8_t* src, size_t num_pixels, Pred pred)
{
    const uint8_t cmp_false = 0x00;
    const uint8_t cmp_true = 0xff;

    for (size_t i = 0; i < num_pixels; i++)
        des[i] = pred(src[i]) ? cmp_true : cmp_false;
}

// C++ implementation of the pixel thresholding algorithm.
//
// des        - mask image buffer (output); must be 64-byte aligned
// src        - grayscale image buffer (input); must be 64-byte aligned
// num_pixels - number of pixels; must be non-zero, a multiple of 64, and
//              no larger than c_NumPixelsMax
// cmp_op     - compare operator applied as "src[i] <cmp_op> cmp_val"
// cmp_val    - threshold value
//
// Returns true on success. Returns false, leaving des untouched, if any
// argument check fails or cmp_op is not a valid CmpOp value (in which case a
// diagnostic is also written to cout).
bool Avx512ComparePixelsCpp(uint8_t* des, const uint8_t* src, size_t num_pixels, CmpOp cmp_op, uint8_t cmp_val)
{
    // Make sure num_pixels is valid
    if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
        return false;
    if ((num_pixels & 0x3f) != 0)
        return false;

    // Make sure src and des are aligned on a 64-byte boundary
    if (!AlignedMem::IsAligned(src, 64))
        return false;
    if (!AlignedMem::IsAligned(des, 64))
        return false;

    switch (cmp_op)
    {
        case CmpOp::EQ:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p == cmp_val; });
            return true;

        case CmpOp::NE:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p != cmp_val; });
            return true;

        case CmpOp::LT:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p < cmp_val; });
            return true;

        case CmpOp::LE:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p <= cmp_val; });
            return true;

        case CmpOp::GT:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p > cmp_val; });
            return true;

        case CmpOp::GE:
            FillPixelMask(des, src, num_pixels, [cmp_val](uint8_t p) { return p >= cmp_val; });
            return true;

        default:
            std::cout << "Invalid CmpOp: " << cmp_op << '\n';
            return false;
    }
}

int main()

{

const size_t align = 64;

const size_t num_pixels = 4 * 1024 * 1024;

AlignedArray<uint8_t> src_aa(num_pixels, align);

AlignedArray<uint8_t> des1_aa(num_pixels, align);

AlignedArray<uint8_t> des2_aa(num_pixels, align);

uint8_t* src = src_aa.Data();

uint8_t* des1 = des1_aa.Data();

uint8_t* des2 = des2_aa.Data();

const uint8_t cmp_vals[] {197, 222, 43, 43, 129, 222};

const CmpOp cmp_ops[] {CmpOp::EQ, CmpOp::NE, CmpOp::LT, CmpOp::LE, CmpOp::GT, CmpOp::GE};

const size_t num_cmp_vals = sizeof(cmp_vals) / sizeof(uint8_t);

const size_t num_cmp_ops = sizeof(cmp_ops) / sizeof(CmpOp);

assert(num_cmp_vals == num_cmp_ops);

Init(src, num_pixels, 511);

cout << "Results for Ch14_03 ";

for (size_t i = 0; i < num_cmp_ops; i++)

{

Avx512ComparePixelsCpp(des1, src, num_pixels, cmp_ops[i], cmp_vals[i]);

Avx512ComparePixels_(des2, src, num_pixels, cmp_ops[i], cmp_vals[i]);

ShowResults(des1, des2, num_pixels, cmp_ops[i], cmp_vals[i], i + 1);

}

return 0;

}

;-------------------------------------------------

; Ch14_03.asm

;-------------------------------------------------

include <cmpequ.asmh>

extern c_NumPixelsMax:qword

; Macro CmpPixels

; _CmpPixels expands to a complete processing loop, including the function's
; success return, for one pixel compare operator.
;
; Macro argument:
;   CmpOp = compare predicate passed to vpcmpub (e.g. CMP_EQ, CMP_NLE)
;
; Required register state before the expansion executes:
;   rax  = 0 (used as the byte offset into both buffers)
;   rcx  = mask image pixel buffer (des)
;   rdx  = grayscale image pixel buffer (src)
;   r8   = number of pixels (non-zero multiple of 64)
;   zmm4 = packed byte threshold values
;   zmm5 = packed 0xff byte values
;
; Each iteration handles 64 pixels. Because the expansion ends with ret,
; execution never falls through into the code that follows the macro call.

_CmpPixels macro CmpOp

align 16

@@: vmovdqa64 zmm0,zmmword ptr [rdx+rax] ;load next block of 64 pixels

vpcmpub k1,zmm0,zmm4,CmpOp ;perform compare operation

vmovdqu8 zmm1{k1}{z},zmm5 ;set mask pixels to 0 or 255 using opmask

vmovdqa64 zmmword ptr [rcx+rax],zmm1 ;save mask pixels

add rax,64 ;update offset

sub r8,64 ;64 fewer pixels remaining

jnz @B ;repeat until done

mov eax,1 ;set success return code

vzeroupper

ret

endm

; extern "C" bool Avx512ComparePixels_(uint8_t* des, const uint8_t* src,

; size_t num_pixels, CmpOp cmp_op, uint8_t cmp_val);

.code

; Avx512ComparePixels_
;
; Builds a mask image: each des pixel is set to 0xff when the corresponding
; src pixel satisfies "src <cmp_op> cmp_val" and to 0x00 otherwise.
;
; Arguments as accessed below (see the prototype comment):
;   rcx = des, rdx = src, r8 = num_pixels, r9d = cmp_op,
;   byte ptr [rsp+40] = cmp_val
;
; Returns eax = 1 on success. Returns eax = 0 if num_pixels is zero, exceeds
; c_NumPixelsMax, is not a multiple of 64, if des or src is not aligned on a
; 64-byte boundary, or if cmp_op is not one of the values 0 through 5.
;
; The compare chain below acts as a switch on cmp_op. Each _CmpPixels
; expansion contains its own ret, so a matched case never reaches the next
; label.

Avx512ComparePixels_ proc

; Make sure num_pixels is valid and pixel buffers are properly aligned

xor eax,eax ;set error code (also array offset)

or r8,r8

jz Done ;jump if num_pixels is zero

cmp r8,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8,3fh

jnz Done ;jump if num_pixels % 64 != 0

test rcx,3fh

jnz Done ;jump if des not aligned

test rdx,3fh

jnz Done ;jump if src not aligned

; Perform required initializations

vpbroadcastb zmm4,byte ptr [rsp+40] ;zmm4 = packed cmp_val

mov r10d,255

vpbroadcastb zmm5,r10d ;zmm5 = packed 255

; Perform specified compare operation

cmp r9d,0

jne LB_NE

_CmpPixels CMP_EQ ;CmpOp::EQ

LB_NE: cmp r9d,1

jne LB_LT

_CmpPixels CMP_NEQ ;CmpOp::NE

LB_LT: cmp r9d,2

jne LB_LE

_CmpPixels CMP_LT ;CmpOp::LT

LB_LE: cmp r9d,3

jne LB_GT

_CmpPixels CMP_LE ;CmpOp::LE

LB_GT: cmp r9d,4

jne LB_GE

_CmpPixels CMP_NLE ;CmpOp::GT (not less than or equal)

LB_GE: cmp r9d,5

jne Done ;unknown cmp_op: return with eax = 0

_CmpPixels CMP_NLT ;CmpOp::GE (not less than)

Done: vzeroupper

ret

Avx512ComparePixels_ endp

end

Listing 14-3.

Example Ch14_03

Near the top of the header file Ch14_03.h is an enum named CmpOp, which contains identifiers for the common compare operations. This is followed by the example’s function declarations. The C++ functions Init and ShowResults are ancillary functions that perform test array initialization and display results. The source code for these functions is not shown in Listing 14-3 but is included with the chapter download package. The function Avx512ComparePixels_ is an AVX-512 assembly language function that implements the pixel thresholding algorithm.

The function Avx512ComparePixelsCpp contains the C++ implementation of the updated thresholding algorithm. This function begins its execution by validating num_pixels for size and divisibility by 64. It then verifies that the pixel buffers src and des are properly aligned on a 64-byte boundary. Following argument validation code is a switch statement that applies the selector cmp_op to select a compare operation. Each switch statement case code block is a simple for loop that compares src[i] against cmp_val using the specified operator and sets pixels in the mask image to 0xff (true compare) or 0x00 (false compare). The function main includes code that allocates the image pixel buffers, exercises the functions Avx512ComparePixelsCpp and Avx512ComparePixels_ using various compare operators and displays results.

The assembly language code in Listing 14-3 commences with the macro _CmpPixels. This macro generates AVX-512 code that implements a processing loop for a pixel compare operator. The macro _CmpPixels requires the following register initializations prior to its use: RAX = 0, RCX = mask image pixel buffer, RDX = grayscale image pixel buffer, R8 = number of pixels, ZMM4 = packed byte threshold values, and ZMM5 = packed 0xff byte values. Each processing loop iteration of _CmpPixels begins with a vmovdqa64 zmm0,zmmword ptr [rdx+rax] instruction that loads 64 unsigned 8-bit integers into register ZMM0. The next instruction, vpcmpub k1,zmm0,zmm4,CmpOp, compares the grayscale pixel intensity values in ZMM0 to the packed values in ZMM4; it then saves the resultant mask in opmask register K1. The ensuing vmovdqu8 zmm1{k1}{z},zmm5 instruction sets each mask pixel value in ZMM1 to 0xff (true compare) or 0x00 (false compare) according to the value of the corresponding bit position in K1. The instruction vmovdqa64 zmmword ptr [rcx+rax],zmm1 then saves the 64 mask pixels to the mask image pixel buffer.

The function Avx512ComparePixels_ employs the macro _CmpPixels to implement the same algorithm as its C++ counterpart Avx512ComparePixelsCpp. Following the requisite argument validation checks, a vpbroadcastb zmm4,byte ptr [rsp+40] instruction broadcasts cmp_val to each byte element in register ZMM4. The next two instructions, mov r10d,255 and vpbroadcastb zmm5,r10d, load the value 0xff into each byte element of ZMM5. The remaining code in Avx512ComparePixels_ uses the argument value cmp_op to implement an ad hoc switch statement that takes advantage of the macro _CmpPixels. Note that this function uses the compare equates CMP_NLE (not less than or equal) or CMP_NLT (not less than) for the _CmpPixels macro argument CmpOp instead of CMP_GT or CMP_GE. The reason for this is that the vpcmpub instruction in _CmpPixels does not support use of the CMP_GT and CMP_GE equates (mathematically these latter equates are equivalent to CMP_NLE and CMP_NLT, but are assigned different values in cmpequ.asmh). Here are the results for source code example Ch14_03:

Results for Ch14_03

Test #1

num_pixels: 4194304

cmp_op: EQ

cmp_val: 197

Pixel masks are identical

Number of non-zero mask pixels = 16424

Test #2

num_pixels: 4194304

cmp_op: NE

cmp_val: 222

Pixel masks are identical

Number of non-zero mask pixels = 4177927

Test #3

num_pixels: 4194304

cmp_op: LT

cmp_val: 43

Pixel masks are identical

Number of non-zero mask pixels = 703652

Test #4

num_pixels: 4194304

cmp_op: LE

cmp_val: 43

Pixel masks are identical

Number of non-zero mask pixels = 719787

Test #5

num_pixels: 4194304

cmp_op: GT

cmp_val: 129

Pixel masks are identical

Number of non-zero mask pixels = 2065724

Test #6

num_pixels: 4194304

cmp_op: GE

cmp_val: 222

Pixel masks are identical

Number of non-zero mask pixels = 556908

Image Statistics

Listing 14-4 shows the source code for example Ch14_04. This example illustrates how to calculate the mean and standard deviation of a grayscale image using its pixel intensity values. In order to make source code example Ch14_04 a little more interesting, the C++ and assembly language functions use only the pixel values that reside between two threshold limits. Pixel values outside of these limits are excluded from any mean and standard deviation calculations.

//------------------------------------------------

// Ch14_04.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Parameter block exchanged with the image statistics functions: the caller
// fills in the input fields and the calculating function writes the results.
// This structure must match the structure that's defined in Ch14_04_.asm.
struct ImageStats
{
uint8_t* m_PixelBuffer; // in: 8-bit pixel values (the calculating functions reject a buffer that is not 64-byte aligned)
uint64_t m_NumPixels; // in: number of pixels in m_PixelBuffer
uint32_t m_PixelValMin; // in: smallest pixel value included in the statistics (inclusive)
uint32_t m_PixelValMax; // in: largest pixel value included in the statistics (inclusive)
uint64_t m_NumPixelsInRange; // out: count of pixels within [m_PixelValMin, m_PixelValMax]
uint64_t m_PixelSum; // out: sum of the in-range pixel values
uint64_t m_PixelSumOfSquares; // out: sum of the squared in-range pixel values
double m_PixelMean; // out: mean of the in-range pixel values
double m_PixelSd; // out: standard deviation of the in-range pixel values
};

// Ch14_04.cpp
// C++ reference implementation; returns false if argument validation fails.
extern bool Avx512CalcImageStatsCpp(ImageStats& im_stats);
// Ch14_04_.asm
// Assembly language implementation of the same calculation (C linkage).
extern "C" bool Avx512CalcImageStats_(ImageStats& im_stats);
// Ch14_04_BM.cpp
// Benchmark driver.
extern void Avx512CalcImageStats_BM(void);
// Common constants
// Inclusive pixel value limits that the test code copies into
// ImageStats::m_PixelValMin and ImageStats::m_PixelValMax.
const uint32_t c_PixelValMin = 40;
const uint32_t c_PixelValMax = 230;

//------------------------------------------------

// Ch14_04.cpp

//------------------------------------------------

#include "stdafx.h"
#include <cmath>
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <stdexcept>
#include "Ch14_04.h"
#include "AlignedMem.h"
#include "ImageMatrix.h"

using namespace std;

// Largest pixel count accepted by the image statistics functions. Defined with
// C linkage because Ch14_04_.asm imports the symbol (extern c_NumPixelsMax:qword).
extern "C" uint64_t c_NumPixelsMax = 256 * 1024;

// Calculates the mean and standard deviation of the pixels in
// im_stats.m_PixelBuffer whose values lie within the inclusive range
// [im_stats.m_PixelValMin, im_stats.m_PixelValMax].
//
// Inputs read from im_stats:  m_PixelBuffer, m_NumPixels, m_PixelValMin,
//                             m_PixelValMax.
// Results written to im_stats: m_NumPixelsInRange, m_PixelSum,
//                             m_PixelSumOfSquares, m_PixelMean, m_PixelSd.
//
// Returns false (leaving im_stats untouched) when m_NumPixels is zero or
// exceeds c_NumPixelsMax, or when the pixel buffer is not 64-byte aligned;
// returns true otherwise.
//
// Note: when fewer than two pixels are in range the variance denominator
// n * (n - 1) is zero, so m_PixelSd (and, for n == 0, m_PixelMean) is not a
// finite number.
bool Avx512CalcImageStatsCpp(ImageStats& im_stats)
{
    uint64_t num_pixels = im_stats.m_NumPixels;
    const uint8_t* pb = im_stats.m_PixelBuffer;

    // Perform validation checks
    if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
        return false;

    if (!AlignedMem::IsAligned(pb, 64))
        return false;

    // Calculate intermediate sums
    im_stats.m_PixelSum = 0;
    im_stats.m_PixelSumOfSquares = 0;
    im_stats.m_NumPixelsInRange = 0;

    for (size_t i = 0; i < num_pixels; i++)
    {
        uint32_t pval = pb[i];

        if (pval >= im_stats.m_PixelValMin && pval <= im_stats.m_PixelValMax)
        {
            im_stats.m_PixelSum += pval;
            im_stats.m_PixelSumOfSquares += pval * pval;    // at most 255 * 255, fits in 32 bits
            im_stats.m_NumPixelsInRange++;
        }
    }

    // Calculate mean and standard deviation:
    //   var = (n * sum(x^2) - sum(x)^2) / (n * (n - 1))
    double temp0 = (double)im_stats.m_NumPixelsInRange * im_stats.m_PixelSumOfSquares;
    double temp1 = (double)im_stats.m_PixelSum * im_stats.m_PixelSum;
    double var_num = temp0 - temp1;
    double var_den = (double)im_stats.m_NumPixelsInRange * (im_stats.m_NumPixelsInRange - 1);
    double var = var_num / var_den;

    im_stats.m_PixelMean = (double)im_stats.m_PixelSum / im_stats.m_NumPixelsInRange;
    im_stats.m_PixelSd = sqrt(var);

    return true;
}

// Test driver: loads the test image, runs the C++ and assembly language
// image statistics functions on the same pixel buffer, and prints both sets
// of results side by side (C++ result | assembly result).
void Avx512CalcImageStats()
{
    // Backslashes must be escaped inside the wide string literal.
    const wchar_t* image_fn = L"..\\Ch14_Data\\TestImage4.bmp";

    ImageStats is1, is2;
    ImageMatrix im(image_fn);
    uint64_t num_pixels = im.GetNumPixels();
    uint8_t* pb = im.GetPixelBuffer<uint8_t>();

    // Both functions receive identical inputs so their outputs can be compared.
    is1.m_PixelBuffer = pb;
    is1.m_NumPixels = num_pixels;
    is1.m_PixelValMin = c_PixelValMin;
    is1.m_PixelValMax = c_PixelValMax;

    is2.m_PixelBuffer = pb;
    is2.m_NumPixels = num_pixels;
    is2.m_PixelValMin = c_PixelValMin;
    is2.m_PixelValMax = c_PixelValMax;

    const char nl = '\n';
    const char* s = " | ";
    const unsigned int w1 = 22;
    const unsigned int w2 = 12;

    cout << fixed << setprecision(6) << left;
    wcout << fixed << setprecision(6) << left;

    cout << "\nResults for Avx512CalcImageStats\n";
    wcout << setw(w1) << "image_fn:" << setw(w2) << image_fn << nl;
    cout << setw(w1) << "num_pixels:" << setw(w2) << num_pixels << nl;
    cout << setw(w1) << "c_PixelValMin:" << setw(w2) << c_PixelValMin << nl;
    cout << setw(w1) << "c_PixelValMax:" << setw(w2) << c_PixelValMax << nl;

    bool rc1 = Avx512CalcImageStatsCpp(is1);
    bool rc2 = Avx512CalcImageStats_(is2);

    if (!rc1 || !rc2)
    {
        cout << "Bad return code\n";
        cout << " rc1 = " << rc1 << '\n';
        cout << " rc2 = " << rc2 << '\n';
        return;
    }

    cout << nl;

    cout << setw(w1) << "m_NumPixelsInRange: ";
    cout << setw(w2) << is1.m_NumPixelsInRange << s;
    cout << setw(w2) << is2.m_NumPixelsInRange << nl;

    cout << setw(w1) << "m_PixelSum:";
    cout << setw(w2) << is1.m_PixelSum << s;
    cout << setw(w2) << is2.m_PixelSum << nl;

    cout << setw(w1) << "m_PixelSumOfSquares:";
    cout << setw(w2) << is1.m_PixelSumOfSquares << s;
    cout << setw(w2) << is2.m_PixelSumOfSquares << nl;

    cout << setw(w1) << "m_PixelMean:";
    cout << setw(w2) << is1.m_PixelMean << s;
    cout << setw(w2) << is2.m_PixelMean << nl;

    cout << setw(w1) << "m_PixelSd:";
    cout << setw(w2) << is1.m_PixelSd << s;
    cout << setw(w2) << is2.m_PixelSd << nl;
}

int main()

{

try

{

Avx512CalcImageStats();

Avx512CalcImageStats_BM();

}

catch (runtime_error& rte)

{

cout << "'runtime_error' exception has occurred - " << rte.what() << ' ';

}

catch (...)

{

cout << "Unexpected exception has occurred ";

cout << "File = " << __FILE__ << ' ';

}

return 0;

}

;-------------------------------------------------

; Ch14_04_.asm

;-------------------------------------------------

include <cmpequ.asmh>

include <MacrosX86-64-AVX.asmh>

extern c_NumPixelsMax:qword

; Assembly language view of the ImageStats parameter block. The field order
; and sizes mirror the C++ declaration so that the offsets agree.
; This structure must match the structure that's defined in Ch14_04.h
ImageStats struct
PixelBuffer qword ? ;pointer to the 8-bit pixel values
NumPixels qword ? ;number of pixels in the buffer
PixelValMin dword ? ;smallest pixel value included (inclusive)
PixelValMax dword ? ;largest pixel value included (inclusive)
NumPixelsInRange qword ? ;result: count of in-range pixels
PixelSum qword ? ;result: sum of in-range pixel values
PixelSumOfSquares qword ? ;result: sum of squared in-range pixel values
PixelMean real8 ? ;result: mean of in-range pixel values
PixelSd real8 ? ;result: standard deviation of in-range pixel values
ImageStats ends

; _UpdateSums - accumulates the sums for 16 consecutive pixels.
;
; Disp is the byte displacement, relative to rcx, of the first of the 16 pixels.
;
; Expects on entry: rcx   = pixel buffer pointer
;                   zmm31 = packed dword pixel_val_min
;                   zmm30 = packed dword pixel_val_max
; Updates:          zmm16 = packed dword pixel_sum
;                   zmm17 = packed dword pixel_sum_of_squares
;                   r10   = num_pixels_in_range
; Scratch:          zmm0, zmm1, zmm2, k1, k2, k3, rax
;
; NOTE(review): CMP_GE is used below with an integer compare; text elsewhere in
; this chapter states that the packed integer compares need CMP_NLT / CMP_NLE
; rather than CMP_GE / CMP_GT - confirm against cmpequ.asmh.
_UpdateSums macro Disp
vpmovzxbd zmm0,xmmword ptr [rcx+Disp] ;zmm0 = 16 pixels zero-extended to dwords
vpcmpud k1,zmm0,zmm31,CMP_GE ;k1 = mask of pixels >= pixel_val_min
vpcmpud k2,zmm0,zmm30,CMP_LE ;k2 = mask of pixels <= pixel_val_max
kandw k3,k2,k1 ;k3 = mask of in-range pixels
vmovdqa32 zmm1{k3}{z},zmm0 ;zmm1 = in-range pixels (out-of-range elements zeroed)
vpaddd zmm16,zmm16,zmm1 ;update packed pixel_sum
vpmulld zmm2,zmm1,zmm1 ;zmm2 = squares of in-range pixels
vpaddd zmm17,zmm17,zmm2 ;update packed pixel_sum_of_squares
kmovw rax,k3 ;rax = in-range mask bits
popcnt rax,rax ;count number of in-range pixels
add r10,rax ;update num_pixels_in_range
endm

; extern "C" bool Avx512CalcImageStats_(ImageStats& im_stats);

.code

; Avx512CalcImageStats_ - calculates the mean and standard deviation of the
; pixels whose values lie within [PixelValMin, PixelValMax].
;
; On entry:  rcx = pointer to an ImageStats structure (inputs PixelBuffer,
;                  NumPixels, PixelValMin, PixelValMax).
; On return: eax = 1 on success; the structure's NumPixelsInRange, PixelSum,
;                  PixelSumOfSquares, PixelMean and PixelSd fields are written.
;            eax = 0 if NumPixels is zero or exceeds c_NumPixelsMax, or if the
;                  pixel buffer is not 64-byte aligned (nothing is written).
;
; Register usage after validation:
;   rsi = im_stats ptr        rcx = current pixel ptr    rdx = pixels remaining
;   r8  = pixel_val_min       r9  = pixel_val_max        r10 = num_pixels_in_range
;   r13 = scalar pixel_sum    r12 = scalar pixel_sum_of_squares
;   zmm29/zmm28 = packed qword pixel_sum / pixel_sum_of_squares
;   zmm16/zmm17 = per-iteration packed dword sums (see _UpdateSums)

Avx512CalcImageStats_ proc frame
        _CreateFrame CIS_,0,0,rsi,r12,r13
        _EndProlog

; Make sure num_pixels is valid and pixel_buff is properly aligned
        xor eax,eax                             ;set error return code
        mov rsi,rcx                             ;rsi = im_stats ptr

        mov rcx,qword ptr [rsi+ImageStats.PixelBuffer]  ;rcx = pixel buffer ptr
        mov rdx,qword ptr [rsi+ImageStats.NumPixels]    ;rdx = num_pixels
        test rdx,rdx
        jz Done                                 ;jump if num_pixels is zero
        cmp rdx,[c_NumPixelsMax]
        ja Done                                 ;jump if num_pixels too big
        test rcx,3fh
        jnz Done                                ;jump if pixel_buff misaligned

; Perform required initializations
        mov r8d,dword ptr [rsi+ImageStats.PixelValMin]
        mov r9d,dword ptr [rsi+ImageStats.PixelValMax]
        vpbroadcastd zmm31,r8d                  ;packed pixel_val_min
        vpbroadcastd zmm30,r9d                  ;packed pixel_val_max
        vpxorq zmm29,zmm29,zmm29                ;packed pixel_sum
        vpxorq zmm28,zmm28,zmm28                ;packed pixel_sum_of_squares
        xor r10d,r10d                           ;num_pixels_in_range = 0

; The scalar sums are cleared here, unconditionally, because they are added to
; the packed totals below even when no leftover pixels are processed (that is,
; when num_pixels is a multiple of 64).
        xor r13d,r13d                           ;scalar pixel_sum = 0
        xor r12d,r12d                           ;scalar pixel_sum_of_squares = 0

; Compute packed versions of pixel_sum and pixel_sum_of_squares
        cmp rdx,64
        jb LB1                                  ;jump if there are fewer than 64 pixels

        align 16
@@:     vpxord zmm16,zmm16,zmm16                ;loop packed pixel_sum = 0
        vpxord zmm17,zmm17,zmm17                ;loop packed pixel_sum_of_squares = 0

        _UpdateSums 0                           ;process pixel_buff[i+15]:pixel_buff[i]
        _UpdateSums 16                          ;process pixel_buff[i+31]:pixel_buff[i+16]
        _UpdateSums 32                          ;process pixel_buff[i+47]:pixel_buff[i+32]
        _UpdateSums 48                          ;process pixel_buff[i+63]:pixel_buff[i+48]

; Fold the 16 dword sums to 8, widen to qwords, and add to the running totals
        vextracti32x8 ymm0,zmm16,1              ;extract top 8 pixel_sum (dwords)
        vpaddd ymm1,ymm0,ymm16
        vpmovzxdq zmm2,ymm1
        vpaddq zmm29,zmm29,zmm2                 ;update packed pixel_sum (qwords)

        vextracti32x8 ymm0,zmm17,1              ;extract top 8 pixel_sum_of_squares (dwords)
        vpaddd ymm1,ymm0,ymm17
        vpmovzxdq zmm2,ymm1
        vpaddq zmm28,zmm28,zmm2                 ;update packed pixel_sum_of_squares (qwords)

        add rcx,64                              ;update pb ptr
        sub rdx,64                              ;update num_pixels
        cmp rdx,64
        jae @B                                  ;repeat until done

; Process any leftover pixels (fewer than 64) using scalar arithmetic
        align 16
LB1:    test rdx,rdx
        jz LB3                                  ;jump if no more pixels remain

        mov r11,rdx                             ;number of remaining pixels (mul below clobbers rdx)

@@:     movzx rax,byte ptr [rcx]                ;load next pixel
        cmp rax,r8
        jb LB2                                  ;jump if current pixel < pval_min
        cmp rax,r9
        ja LB2                                  ;jump if current pixel > pval_max

        add r13,rax                             ;add to pixel_sum
        mul rax                                 ;rax = pixel * pixel
        add r12,rax                             ;add to pixel_sum_of_squares
        add r10,1                               ;update num_pixels_in_range

LB2:    add rcx,1
        sub r11,1
        jnz @B                                  ;repeat until done

; Save num_pixel_in_range
LB3:    mov qword ptr [rsi+ImageStats.NumPixelsInRange],r10

; Reduce packed pixel_sum to single qword
        vextracti64x4 ymm0,zmm29,1
        vpaddq ymm1,ymm0,ymm29
        vextracti64x2 xmm2,ymm1,1
        vpaddq xmm3,xmm2,xmm1
        vpextrq rax,xmm3,0
        vpextrq r11,xmm3,1
        add rax,r11                             ;rax = sum of qwords in zmm29
        add r13,rax                             ;add scalar pixel_sum
        mov qword ptr [rsi+ImageStats.PixelSum],r13

; Reduce packed pixel_sum_of_squares to single qword
        vextracti64x4 ymm0,zmm28,1
        vpaddq ymm1,ymm0,ymm28
        vextracti64x2 xmm2,ymm1,1
        vpaddq xmm3,xmm2,xmm1
        vpextrq rax,xmm3,0
        vpextrq r11,xmm3,1
        add rax,r11                             ;rax = sum of qwords in zmm28
        add r12,rax                             ;add scalar pixel_sum_of_squares
        mov qword ptr [rsi+ImageStats.PixelSumOfSquares],r12

; Calculate final mean and sd
        vcvtusi2sd xmm0,xmm0,r10                ;num_pixels_in_range (DPFP)
        sub r10,1
        vcvtusi2sd xmm1,xmm1,r10                ;num_pixels_in_range - 1 (DPFP)
        vcvtusi2sd xmm2,xmm2,r13                ;pixel_sum (DPFP)
        vcvtusi2sd xmm3,xmm3,r12                ;pixel_sum_of_squares (DPFP)

        vdivsd xmm4,xmm2,xmm0                   ;final pixel_mean
        vmovsd real8 ptr [rsi+ImageStats.PixelMean],xmm4

        vmulsd xmm4,xmm0,xmm3                   ;num_pixels_in_range * pixel_sum_of_squares
        vmulsd xmm5,xmm2,xmm2                   ;pixel_sum * pixel_sum
        vsubsd xmm2,xmm4,xmm5                   ;var_num
        vmulsd xmm3,xmm0,xmm1                   ;var_den
        vdivsd xmm4,xmm2,xmm3                   ;calc variance
        vsqrtsd xmm0,xmm0,xmm4                  ;final pixel_sd
        vmovsd real8 ptr [rsi+ImageStats.PixelSd],xmm0

        mov eax,1                               ;set success return code

Done:   vzeroupper
        _DeleteFrame rsi,r12,r13
        ret
Avx512CalcImageStats_ endp

end

Listing 14-4.

Example Ch14_04

The mean and standard deviation of the pixels in a grayscale image can be calculated using the following equations:

$\overline{x}=\frac{1}{n}\sum\limits_i x_i$

$s=\sqrt{\frac{n\sum\limits_i x_i^2-\left(\sum\limits_i x_i\right)^2}{n\left(n-1\right)}}$

In the mean and standard deviation equations, the symbol $x_i$ represents an image buffer pixel and $n$ denotes the number of pixels. If you study these equations carefully, you will notice that two intermediate sums must be calculated: the sum of all pixels and the sum of all pixel values squared. Once these quantities are known, the mean and standard deviation can be determined using simple arithmetic. The standard deviation equation that’s detailed here is simple to calculate and suitable for this source code example. For other use cases, however, this same equation is often unsuitable for standard deviation calculations, especially those that involve floating-point values. You may want to consult the statistical variance calculating references that are listed in Appendix A before using this equation in one of your own programs.

Listing 14-4 begins with the C++ header file Ch14_04.h that includes the declaration of a structure named ImageStats. This structure is used to pass image data to the C++ and assembly language calculating functions and return results. A semantically equivalent structure is also defined in the assembly language file Ch14_04_.asm. The file Ch14_04.h also includes the constant definitions c_PixelValMin and c_PixelValMax, which define the range limits that a pixel value must fall between to be included in any statistical calculations.

The function Avx512CalcImageStatsCpp is the principal calculating function in the C++ code. This function requires a reference to an ImageStats structure as its sole argument. Following argument validation, Avx512CalcImageStatsCpp initializes the ImageStats intermediate sums m_PixelSum, m_PixelSumOfSquares, and m_NumPixelsInRange to zero. A simple for loop follows, which calculates m_PixelSum and m_PixelSumOfSquares. During each loop iteration, pixel values are tested for in-range validity before being included in any calculations. Following computation of the intermediate sums, the function Avx512CalcImageStatsCpp calculates the final mean and standard deviation. Note that m_NumPixelsInRange is used to calculate these statistical quantities instead of m_NumPixels. The remaining code in Ch14_04.cpp performs test case initialization, invokes the calculating functions, and streams the results to cout.

Toward the top of the file Ch14_04_.asm is the assembly language version of the structure ImageStats. This is followed by the macro definition _UpdateSums whose inner workings will be described shortly. The function Avx512CalcImageStats_ begins its execution by performing the same argument validation checks as its C++ counterpart. It then initializes packed versions of the intermediate values PixelValMin and PixelValMax. The ensuing vpxorq instructions initialize packed quadword versions of PixelSum and PixelSumOfSquares to zero. Note that the vpxor[d|q] (and other AVX-512 bitwise Boolean) instructions can optionally specify an opmask operand register to perform merge or zero masking of doubleword or quadword elements. The final initialization instruction, xor r10d,r10d, sets NumPixelsInRange to zero.

The processing loop in function Avx512CalcImageStats_ processes 64 pixels each iteration. Prior to the start of the processing loop, register RDX is tested to verify that at least 64 pixels remain. Each processing loop iteration begins with two vpxord instructions that initialize packed doubleword versions of pixel_sum and pixel_sum_of_squares to zero. Following this are four instances of the macro _UpdateSums, which process in aggregate the next group of 64 pixels. The first instruction of this macro, vpmovzxbd zmm0,xmmword ptr [rcx+Disp], loads 16 unsigned byte values from the source pixel buffer and saves these values as unsigned doublewords in register ZMM0. The ensuing vpcmpud k1,zmm0,zmm31,CMP_GE, vpcmpud k2,zmm0,zmm30,CMP_LE, and kandw k3,k2,k1 instructions load opmask register K3 with a mask value of pixels that are greater than or equal to pixel_val_min and less than or equal to pixel_val_max. The vmovdqa32 zmm1{k3}{z},zmm0 instruction that follows uses zero masking to effectively eliminate out-of-range pixel values from further calculations. The subsequent vpaddd and vpmulld instructions then update the packed doubleword quantities pixel_sum and pixel_sum_of_squares. The total number of in-range pixels in R10 is then updated using the instructions kmovw rax,k3, popcnt rax,rax, and add r10,rax. Figure 14-2 illustrates these calculations in greater detail. Note that this figure shows only the low-order 256 bits of each ZMM register and the low-order 8 bits of each opmask register.

../images/326959_2_En_14_Chapter/326959_2_En_14_Fig2_HTML.jpg — Figure 14-2.
Calculations performed by instructions in macro *_UpdateSums*

Following the four _UpdateSums usages, the doubleword elements of registers ZMM16 and ZMM17 contain packed copies of the values pixel_sum and pixel_sum_of_squares for the current block of 64 pixels. The vextracti32x8 ymm0,zmm16,1 and vpaddd ymm1,ymm0,ymm16 instructions reduce the number of doubleword values in register ZMM16 from 16 to 8. The ensuing vpmovzxdq zmm2,ymm1 instruction promotes these doubleword values to quadwords, and the vpaddq zmm29,zmm29,zmm2 instruction updates the global packed quadword pixel_sum values that are maintained in register ZMM29. A similar sequence of instructions is then used to update the global packed quadword pixel_sum_of_squares values in register ZMM28. Following these instructions, the processing loop updates its pointer register and counters; it then repeats until the number of remaining pixels falls below 64.

The block of code that starts at the label LB1 computes pixel_sum and pixel_sum_of_squares for the final few pixels (if any) using scalar integer arithmetic and the general-purpose registers. A series of extract (vextracti64x4, vextracti64x2, and vpextrq) and vpaddq instructions reduce the eight packed quadword pixel_sum values in ZMM29 to a single quadword value. A similar sequence of instructions is then used to calculate the final value for pixel_sum_of_squares. Note that these intermediate results are saved in the ImageStats structure pointed to by register RSI. The function Avx512CalcImageStats_ then executes a chain of vcvtusi2sd instructions to convert the intermediate results from unsigned quadword integers to double-precision floating-point. The final mean and standard deviation values are calculated using scalar double-precision floating-point arithmetic. The results for source code example Ch14_04 follow this paragraph. Table 14-1 shows benchmark timing measurements for the C++ and assembly language calculating functions Avx512CalcImageStatsCpp and Avx512CalcImageStats_.

Results for Avx512CalcImageStats

image_fn: ..\Ch14_Data\TestImage4.bmp

num_pixels: 258130

c_PixelValMin: 40

c_PixelValMax: 230

m_NumPixelsInRange: 229897 | 229897

m_PixelSum: 32574462 | 32574462

m_PixelSumOfSquares: 5139441032 | 5139441032

m_PixelMean: 141.691549 | 141.691549

m_PixelSd: 47.738056 | 47.738056

Running benchmark function Avx512CalcImageStats_BM - please wait

Benchmark times save to file Ch14_04_Avx512CalcImageStats_BM_CHROMIUM.csv

Table 14-1.

Benchmark Timing Measurements for Image Statistics Calculating Functions Using TestImage4.bmp

CPU	Avx512CalcImageStatsCpp	Avx512CalcImageStats_
i7-4790S	----	----
i9-7900X	404	29
i7-8700K	----	----

RGB to Grayscale Conversion

In Chapter 10, you learned how to use the AVX2 instruction set to convert an RGB image to a grayscale image (see example Ch10_06). Listing 14-5 shows the source code for example Ch14_05, which illustrates RGB to grayscale image conversion using the AVX-512 instruction set.

//------------------------------------------------

// Ch14_05.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <stdexcept>

#include <iomanip>

#include "Ch14_05.h"

#include "ImageMatrix.h"

#include "AlignedMem.h"

using namespace std;

// Pixel count limits enforced by the conversion functions. They are defined
// with C linkage because Ch14_05_.asm imports both symbols as dwords.
extern "C" const int c_NumPixelsMin = 64;
extern "C" const int c_NumPixelsMax = 16 * 1024 * 1024;
// RGB to grayscale conversion coefficients
// (index 0 scales red, index 1 scales green, index 2 scales blue)
const float c_Coef[3] {0.2126f, 0.7152f, 0.0722f};

// Compares two grayscale pixel buffers element by element.
// Returns true when every pair of corresponding pixels differs by at most 1
// (also true when num_pixels <= 0); returns false at the first larger difference.
bool CompareGsImages(const uint8_t* pb_gs1,const uint8_t* pb_gs2, int num_pixels)
{
    int idx = 0;

    while (idx < num_pixels)
    {
        // Widen to int first so the subtraction can go negative.
        const int delta = static_cast<int>(pb_gs1[idx]) - static_cast<int>(pb_gs2[idx]);

        if (delta > 1 || delta < -1)
            return false;

        ++idx;
    }

    return true;
}

// Converts planar RGB pixel data to 8-bit grayscale using a weighted sum.
//
//   pb_gs       destination grayscale buffer (must be 64-byte aligned)
//   pb_rgb      three plane pointers: [0] = red, [1] = green, [2] = blue
//               (each plane must be 64-byte aligned)
//   num_pixels  pixel count; must lie in [c_NumPixelsMin, c_NumPixelsMax]
//               and be a multiple of 64
//   coef        weights applied to the red, green and blue values
//
// Returns false without writing any output if a validation check fails;
// otherwise writes num_pixels grayscale values and returns true.
bool Avx512RgbToGsCpp(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3])
{
    const bool count_in_range = (num_pixels >= c_NumPixelsMin) && (num_pixels <= c_NumPixelsMax);

    if (!count_in_range)
        return false;

    if ((num_pixels % 64) != 0)
        return false;

    if (!AlignedMem::IsAligned(pb_gs, 64))
        return false;

    const size_t boundary = 64;
    const uint8_t* red = pb_rgb[0];
    const uint8_t* green = pb_rgb[1];
    const uint8_t* blue = pb_rgb[2];

    // Planes are checked in R, G, B order; the first misaligned one rejects the call.
    if (!AlignedMem::IsAligned(red, boundary)
        || !AlignedMem::IsAligned(green, boundary)
        || !AlignedMem::IsAligned(blue, boundary))
    {
        return false;
    }

    for (int px = 0; px < num_pixels; ++px)
    {
        const uint8_t rv = red[px];
        const uint8_t gv = green[px];
        const uint8_t bv = blue[px];

        // Weighted sum plus 0.5 so the truncating cast below rounds.
        float gray = rv * coef[0] + gv * coef[1] + bv * coef[2] + 0.5f;

        // Clip to the representable 8-bit range.
        gray = (gray < 0.0f) ? 0.0f : ((gray > 255.0f) ? 255.0f : gray);

        pb_gs[px] = (uint8_t)gray;
    }

    return true;
}

// Test driver: loads the RGB test image, splits it into separate R, G and B
// planes, runs the C++, AVX-512 and AVX2 conversion functions, saves the three
// grayscale results, and reports whether they agree (to within the tolerance
// applied by CompareGsImages).
void Avx512RgbToGs(void)
{
    // Backslashes must be escaped inside the wide string literal.
    const wchar_t* fn_rgb = L"..\\Ch14_Data\\TestImage3.bmp";
    const wchar_t* fn_gs1 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS1.bmp";
    const wchar_t* fn_gs2 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS2.bmp";
    const wchar_t* fn_gs3 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS3.bmp";

    ImageMatrix im_rgb(fn_rgb);
    int im_h = im_rgb.GetHeight();
    int im_w = im_rgb.GetWidth();
    int num_pixels = im_h * im_w;

    // One single-channel image per color component
    ImageMatrix im_r(im_h, im_w, PixelType::Gray8);
    ImageMatrix im_g(im_h, im_w, PixelType::Gray8);
    ImageMatrix im_b(im_h, im_w, PixelType::Gray8);

    RGB32* pb_rgb = im_rgb.GetPixelBuffer<RGB32>();
    uint8_t* pb_r = im_r.GetPixelBuffer<uint8_t>();
    uint8_t* pb_g = im_g.GetPixelBuffer<uint8_t>();
    uint8_t* pb_b = im_b.GetPixelBuffer<uint8_t>();
    uint8_t* pb_rgb_cp[3] {pb_r, pb_g, pb_b};

    // Split the interleaved RGB pixels into the three color planes
    for (int i = 0; i < num_pixels; i++)
    {
        pb_rgb_cp[0][i] = pb_rgb[i].m_R;
        pb_rgb_cp[1][i] = pb_rgb[i].m_G;
        pb_rgb_cp[2][i] = pb_rgb[i].m_B;
    }

    ImageMatrix im_gs1(im_h, im_w, PixelType::Gray8);
    ImageMatrix im_gs2(im_h, im_w, PixelType::Gray8);
    ImageMatrix im_gs3(im_h, im_w, PixelType::Gray8);
    uint8_t* pb_gs1 = im_gs1.GetPixelBuffer<uint8_t>();
    uint8_t* pb_gs2 = im_gs2.GetPixelBuffer<uint8_t>();
    uint8_t* pb_gs3 = im_gs3.GetPixelBuffer<uint8_t>();

    // Exercise conversion functions
    bool rc1 = Avx512RgbToGsCpp(pb_gs1, pb_rgb_cp, num_pixels, c_Coef);
    bool rc2 = Avx512RgbToGs_(pb_gs2, pb_rgb_cp, num_pixels, c_Coef);
    bool rc3 = Avx2RgbToGs_(pb_gs3, pb_rgb_cp, num_pixels, c_Coef);

    if (rc1 && rc2 && rc3)
    {
        // Each result image goes to its own file (GS3 holds the AVX2 output).
        im_gs1.SaveToBitmapFile(fn_gs1);
        im_gs2.SaveToBitmapFile(fn_gs2);
        im_gs3.SaveToBitmapFile(fn_gs3);

        bool c1 = CompareGsImages(pb_gs1, pb_gs2, num_pixels);
        bool c2 = CompareGsImages(pb_gs2, pb_gs3, num_pixels);

        if (c1 && c2)
            cout << "Grayscale image compare OK\n";
        else
            cout << "Grayscale image compare failed\n";
    }
    else
        cout << "Invalid return code\n";
}

int main()

{

try

{

Avx512RgbToGs();

Avx512RgbToGs_BM();

}

catch (runtime_error& rte)

{

cout << "'runtime_error' exception has occurred - " << rte.what() << ' ';

}

catch (...)

{

cout << "Unexpected exception has occurred ";

}

return 0;

}

;-------------------------------------------------

; Ch14_05.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

extern c_NumPixelsMin:dword

extern c_NumPixelsMax:dword

; Read-only single-precision constants used by the RGB to grayscale functions
.const
r4_0p5 real4 0.5 ;bias added to each weighted sum before conversion to integer
r4_255p0 real4 255.0 ;upper clip limit for a grayscale value

; extern "C" bool Avx512RgbToGs_(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3]);

.code

; Avx512RgbToGs_ - converts planar RGB pixel data to 8-bit grayscale using
; 512-bit wide operands (16 pixels per loop iteration).
;
; Arguments as used below:
;   rcx = pb_gs       destination grayscale buffer (must be 64-byte aligned)
;   rdx = pb_rgb      array of three plane pointers: [rdx] = R, [rdx+8] = G,
;                     [rdx+16] = B (each plane must be 64-byte aligned)
;   r8d = num_pixels  must lie in [c_NumPixelsMin, c_NumPixelsMax] and be a
;                     multiple of 64
;   r9  = coef        three real4 coefficients applied to R, G and B
;
; Returns eax = 1 on success, or eax = 0 if any validation check fails.
Avx512RgbToGs_ proc frame
_CreateFrame RGBGS0_,0,96,r13,r14,r15
_SaveXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
_EndProlog
xor eax,eax ;error return code (also pixel_buffer offset)
cmp r8d,[c_NumPixelsMin]
jl Done ;jump if num_pixels < min value
cmp r8d,[c_NumPixelsMax]
jg Done ;jump if num_pixels > max value
test r8d,3fh
jnz Done ;jump if (num_pixels % 64) != 0
test rcx,3fh
jnz Done ;jump if pb_gs is not aligned
mov r13,[rdx] ;r13 = pb_r
test r13,3fh
jnz Done ;jump if pb_r is not aligned
mov r14,[rdx+8] ;r14 = pb_g
test r14,3fh
jnz Done ;jump if pb_g is not aligned
mov r15,[rdx+16] ;r15 = pb_b
test r15,3fh
jnz Done ;jump if pb_b is not aligned
; Perform required initializations
vbroadcastss zmm10,real4 ptr [r9] ;zmm10 = packed coef[0]
vbroadcastss zmm11,real4 ptr [r9+4] ;zmm11 = packed coef[1]
vbroadcastss zmm12,real4 ptr [r9+8] ;zmm12 = packed coef[2]
vbroadcastss zmm13,real4 ptr [r4_0p5] ;zmm13 = packed 0.5
vbroadcastss zmm14,real4 ptr [r4_255p0] ;zmm14 = packed 255.0
vxorps zmm15,zmm15,zmm15 ;zmm15 = packed 0.0
mov r8d,r8d ;r8 = num_pixels (zero-extended to 64 bits)
mov r10,16 ;r10 - number of pixels / iteration
; Load next block of pixels
align 16
@@: vpmovzxbd zmm0,xmmword ptr [r13+rax] ;zmm0 = 16 pixels (r values)
vpmovzxbd zmm1,xmmword ptr [r14+rax] ;zmm1 = 16 pixels (g values)
vpmovzxbd zmm2,xmmword ptr [r15+rax] ;zmm2 = 16 pixels (b values)
; Convert dword values to SPFP and multiply by coefficients
vcvtdq2ps zmm0,zmm0 ;zmm0 = 16 pixels SPFP (r values)
vcvtdq2ps zmm1,zmm1 ;zmm1 = 16 pixels SPFP (g values)
vcvtdq2ps zmm2,zmm2 ;zmm2 = 16 pixels SPFP (b values)
vmulps zmm0,zmm0,zmm10 ;zmm0 = r values * coef[0]
vmulps zmm1,zmm1,zmm11 ;zmm1 = g values * coef[1]
vmulps zmm2,zmm2,zmm12 ;zmm2 = b values * coef[2]
; Sum color components & clip values to [0.0, 255.0]
vaddps zmm3,zmm0,zmm1 ;r + g
vaddps zmm4,zmm3,zmm2 ;r + g + b
vaddps zmm5,zmm4,zmm13 ;r + g + b + 0.5
vminps zmm0,zmm5,zmm14 ;clip pixels above 255.0
vmaxps zmm1,zmm0,zmm15 ;clip pixels below 0.0
; Convert grayscale values from SPFP to byte, save results
vcvtps2dq zmm2,zmm1 ;convert SPFP values to dwords
vpmovusdb xmm3,zmm2 ;convert to bytes
vmovdqa xmmword ptr [rcx+rax],xmm3 ;save grayscale image pixels
add rax,r10 ;advance pixel_buffer offset
sub r8,r10 ;update remaining pixel count
jnz @B ;repeat until all pixels are processed
mov eax,1 ;set success return code
Done: vzeroupper
_RestoreXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
_DeleteFrame r13,r14,r15
ret
Avx512RgbToGs_ endp

; extern "C" bool Avx2RgbToGs_(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3]);

.code

; Avx2RgbToGs_ - converts planar RGB pixel data to 8-bit grayscale using
; 256-bit wide operands (8 pixels per loop iteration).
;
; Arguments as used below:
;   rcx = pb_gs       destination grayscale buffer (must be 64-byte aligned)
;   rdx = pb_rgb      array of three plane pointers: [rdx] = R, [rdx+8] = G,
;                     [rdx+16] = B (each plane must be 64-byte aligned)
;   r8d = num_pixels  must lie in [c_NumPixelsMin, c_NumPixelsMax] and be a
;                     multiple of 64
;   r9  = coef        three real4 coefficients applied to R, G and B
;
; Returns eax = 1 on success, or eax = 0 if any validation check fails.
Avx2RgbToGs_ proc frame
_CreateFrame RGBGS1_,0,96,r13,r14,r15
_SaveXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
_EndProlog
xor eax,eax ;error return code (also pixel_buffer offset)
cmp r8d,[c_NumPixelsMin]
jl Done ;jump if num_pixels < min value
cmp r8d,[c_NumPixelsMax]
jg Done ;jump if num_pixels > max value
test r8d,3fh
jnz Done ;jump if (num_pixels % 64) != 0
test rcx,3fh
jnz Done ;jump if pb_gs is not aligned
mov r13,[rdx] ;r13 = pb_r
test r13,3fh
jnz Done ;jump if pb_r is not aligned
mov r14,[rdx+8] ;r14 = pb_g
test r14,3fh
jnz Done ;jump if pb_g is not aligned
mov r15,[rdx+16] ;r15 = pb_b
test r15,3fh
jnz Done ;jump if pb_b is not aligned
; Perform required initializations
vbroadcastss ymm10,real4 ptr [r9] ;ymm10 = packed coef[0]
vbroadcastss ymm11,real4 ptr [r9+4] ;ymm11 = packed coef[1]
vbroadcastss ymm12,real4 ptr [r9+8] ;ymm12 = packed coef[2]
vbroadcastss ymm13,real4 ptr [r4_0p5] ;ymm13 = packed 0.5
vbroadcastss ymm14,real4 ptr [r4_255p0] ;ymm14 = packed 255.0
vxorps ymm15,ymm15,ymm15 ;ymm15 = packed 0.0
mov r8d,r8d ;r8 = num_pixels (zero-extended to 64 bits)
mov r10,8 ;r10 - number of pixels / iteration
; Load next block of pixels
align 16
@@: vpmovzxbd ymm0,qword ptr [r13+rax] ;ymm0 = 8 pixels (r values)
vpmovzxbd ymm1,qword ptr [r14+rax] ;ymm1 = 8 pixels (g values)
vpmovzxbd ymm2,qword ptr [r15+rax] ;ymm2 = 8 pixels (b values)
; Convert dword values to SPFP and multiply by coefficients
vcvtdq2ps ymm0,ymm0 ;ymm0 = 8 pixels SPFP (r values)
vcvtdq2ps ymm1,ymm1 ;ymm1 = 8 pixels SPFP (g values)
vcvtdq2ps ymm2,ymm2 ;ymm2 = 8 pixels SPFP (b values)
vmulps ymm0,ymm0,ymm10 ;ymm0 = r values * coef[0]
vmulps ymm1,ymm1,ymm11 ;ymm1 = g values * coef[1]
vmulps ymm2,ymm2,ymm12 ;ymm2 = b values * coef[2]
; Sum color components & clip values to [0.0, 255.0]
vaddps ymm3,ymm0,ymm1 ;r + g
vaddps ymm4,ymm3,ymm2 ;r + g + b
vaddps ymm5,ymm4,ymm13 ;r + g + b + 0.5
vminps ymm0,ymm5,ymm14 ;clip pixels above 255.0
vmaxps ymm1,ymm0,ymm15 ;clip pixels below 0.0
; Convert grayscale components from SPFP to byte, save results
vcvtps2dq ymm2,ymm1 ;convert SPFP values to dwords
vpackusdw ymm3,ymm2,ymm2 ;dwords to words (packed within each 128-bit lane)
vextracti128 xmm4,ymm3,1 ;xmm4 = words from the upper lane (pixels 4-7)
vpackuswb xmm5,xmm3,xmm4 ;byte GS pixels in xmm5[31:0] and xmm5[95:64]
vpextrd r11d,xmm5,0 ;r11d = 4 grayscale pixels (pixels 0-3)
mov dword ptr [rcx+rax],r11d ;save grayscale image pixels
vpextrd r11d,xmm5,2 ;r11d = 4 grayscale pixels (pixels 4-7)
mov dword ptr [rcx+rax+4],r11d ;save grayscale image pixels
add rax,r10 ;advance pixel_buffer offset
sub r8,r10 ;update remaining pixel count
jnz @B ;repeat until all pixels are processed
mov eax,1 ;set success return code
Done: vzeroupper
_RestoreXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
_DeleteFrame r13,r14,r15
ret
Avx2RgbToGs_ endp

end

Listing 14-5.

Example Ch14_05

The algorithm that’s used in this example to perform RGB to grayscale image conversion is the same one that was used in Ch10_06. As explained in Chapter 10, the algorithm uses a simple weighted average to transform an RGB image pixel into a grayscale image pixel. The C++ function Avx512RgbToGs begins its execution by loading the test image file. It then copies the RGB pixels of im_rgb into three separate color component image buffers. The reason for doing this is that this example’s RGB to grayscale conversion functions require a structure of arrays (SOA) instead of an array of structures (AOS), which was employed in source code example Ch10_06. Following allocation of the grayscale image buffers, Avx512RgbToGs invokes the C++ and assembly language conversion functions. The resultant grayscale image buffers are then compared for equality and saved.

The assembly language code in Listing 14-5 includes two functions: Avx512RgbToGs_ and Avx2RgbToGs_. As implied by their respective name prefixes, these functions perform RGB to grayscale image conversions using AVX-512 and AVX2 instructions, respectively. The function Avx512RgbToGs_ begins its execution by validating num_pixels for size and divisibility by 64. It then checks the source and destination pixel buffers for proper alignment. The ensuing series of vbroadcastss instructions load packed versions of the color conversion coefficients into registers ZMM10, ZMM11, and ZMM12. This is followed by another set of vbroadcastss instructions that broadcast the single-precision floating-point constants 0.5, 255.0, and 0.0 to registers ZMM13, ZMM14, and ZMM15. The mov r8d,r8d instruction zero-extends num_pixels into R8, and the mov r10,16 instruction loads R10 with the number of pixels to process during each loop iteration.

Each Avx512RgbToGs_ processing loop iteration starts with three vpmovzxbd instructions that load 16 red, green, and blue pixel values into registers ZMM0, ZMM1, and ZMM2. The ensuing vcvtdq2ps instructions convert the doubleword pixel values to single-precision floating-point. The floating-point color values are then multiplied by the corresponding color coefficients using a series of vmulps instructions. These values are then summed using three vaddps instructions. The resultant 16 grayscale pixel values are then clipped to [0.0, 255.0] and converted to doubleword values. The vpmovusdb xmm3,zmm2 instruction size-reduces the doubleword values to bytes using unsigned saturation, and the vmovdqa xmmword ptr [rcx+rax],xmm3 instruction saves the 16 byte pixel values to the destination grayscale image buffer.

The assembly language function Avx2RgbToGs_ is identical to its AVX-512 counterpart except for two minor changes: Avx2RgbToGs_ uses AVX2 instructions and the YMM register set to carry out the required calculations; it also uses the vpackusdw and vpackuswb instructions in conjunction with a few other instructions to perform the doubleword to byte size reductions. The reason for this is that AVX2 does not support the vpmovusdb instruction. Here is the output for source code example Ch14_05:

Grayscale image compare OK

Running benchmark function Avx512RgbToGs_BM - please wait

Benchmark times save to file Ch14_05_Avx512RgbToGs_BM_CHROMIUM.csv

Table 14-2 shows the benchmark timing measurements for source code example Ch14_05.

Table 14-2.

Mean Execution Times (Microseconds) for RGB to Grayscale Image Conversion Using TestImage3.bmp

CPU	Avx512RgbToGsCpp	Avx512RgbToGs_	Avx2RgbToGs_
i7-4790S	----	----	----
i9-7900X	1125	134	259
i7-8700K	----	----	----

The benchmark time differences between the AVX-512 and AVX2 implementations of the RGB to grayscale conversion algorithm are consistent with what one might expect. It is interesting to compare these numbers with the benchmark timing measurements from source code example Ch10_06 (see Table 10-2). This earlier example used an array of RGB32 pixels (or AOS) for the source image buffer, and the mean execution time for the conversion function Avx2ConvertRgbToGs_ was 593 microseconds. The current example exploits separate image pixel buffers for each color component (or SOA) , which significantly improves performance.

Summary

Here are the key learning points for Chapter 14:

Assembly language functions can use AVX-512 promoted versions of most AVX and AVX2 packed integer instructions to perform operations using 512-, 256-, and 128-bit wide operands.
Assembly language functions can use the vmovdqa[32|64] and vmovdqu[8|16|32|64] instructions to perform aligned and unaligned moves of packed integer operands.
Assembly language functions can use the vpmovus[qd|qw|qb|dw|db|wb] instructions to carry out packed integer size reductions using unsigned saturation. AVX-512 also supports an analogous set of packed integer size-reducing instructions using signed saturation.
The vpcmpu[b|w|d|q] instructions perform packed unsigned integer compare operations and save the resultant compare mask to an opmask register.
The vpand[d|q], vpandn[d|q], vpor[d|q], and vpxor[d|q] instructions can be used with an opmask register to perform merge or zero masking using doubleword or quadword elements.
The vextracti[32x4|32x8|64x2|64x4] instructions can be used to extract packed doubleword or quadword values from a packed integer operand.
When performing SIMD calculations using packed integer or floating-point operands, a structure-of-arrays construct is often significantly faster than an array-of-structures construct.

..................Content has been hidden....................

You can't read the whole page of this ebook. Please click here to log in and view the full page.

Table of Contents for 14. AVX-512 Programming – Packed Integers

Create new playlist

Sign In

Sign Up

14. AVX-512 Programming – Packed Integers

Basic Arithmetic

Image Processing

Pixel Conversions

Image Thresholding

Image Statistics

RGB to Grayscale Conversion

Summary

Table of Contents for
14. AVX-512 Programming – Packed Integers