© Daniel Kusswurm 2018
Daniel Kusswurm, Modern X86 Assembly Language Programming, https://doi.org/10.1007/978-1-4842-4063-2_14

14. AVX-512 Programming – Packed Integers

Daniel Kusswurm1 
(1)
Geneva, IL, USA
 

In Chapters 7 and 10, you learned how to use the AVX and AVX2 instruction sets to perform packed integer operations using 128-bit and 256-bit wide operands. In this chapter, you learn how to use the AVX-512 instruction set to carry out packed integer operations using 512-bit wide operands. You also learn how to use AVX-512 instructions with 256-bit and 128-bit wide packed integer operands. The first source code example explains how to perform basic packed integer arithmetic using ZMM registers. This is followed by several examples that exemplify image-processing algorithms and techniques using AVX-512 instructions. Like the previous chapter, all of the source code examples in this chapter require a processor and operating system that support AVX-512 and the following instruction set extensions: AVX512F, AVX512CD, AVX512BW, AVX512DQ, and AVX512VL. You can use one of the freely available utilities listed in Appendix A to determine whether your system supports these extensions.

Basic Arithmetic

Listing 14-1 shows the source code for example Ch14_01. This example demonstrates how to perform basic packed integer arithmetic using 512-bit wide operands and the ZMM register set.
//------------------------------------------------
//        Ch14_01.cpp
//------------------------------------------------
#include "stdafx.h"
#include <cstdint>
#include <iostream>
#include <iomanip>
#include "Zmmval.h"
using namespace std;
// Assembly language functions from the Ch14_01 assembly listing.
// Each function reads the 512-bit operands *a and *b and stores one ZmmVal per
// demonstrated instruction into c. Both routines store six results (offsets 0
// through 320 bytes), so c must reference at least six ZmmVal elements, and the
// aligned stores used by the assembly code require c to be 64-byte aligned.
// opmask supplies the zero-masking bits for the quadword routine (low 8 bits used).
extern "C" void Avx512PackedMathI16_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6]);
extern "C" void Avx512PackedMathI64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6], uint32_t opmask);
void Avx512PackedMathI16(void)
{
  alignas(64) ZmmVal a;
  alignas(64) ZmmVal b;
  alignas(64) ZmmVal c[6];
  a.m_I16[0] = 10;    b.m_I16[0] = 100;
  a.m_I16[1] = 20;    b.m_I16[1] = 200;
  a.m_I16[2] = 30;    b.m_I16[2] = 300;
  a.m_I16[3] = 40;    b.m_I16[3] = 400;
  a.m_I16[4] = 50;    b.m_I16[4] = 500;
  a.m_I16[5] = 60;    b.m_I16[5] = 600;
  a.m_I16[6] = 70;    b.m_I16[6] = 700;
  a.m_I16[7] = 80;    b.m_I16[7] = 800;
  a.m_I16[8] = 1000;   b.m_I16[8] = -100;
  a.m_I16[9] = 2000;   b.m_I16[9] = 200;
  a.m_I16[10] = 3000;  b.m_I16[10] = -300;
  a.m_I16[11] = 4000;  b.m_I16[11] = 400;
  a.m_I16[12] = 5000;  b.m_I16[12] = -500;
  a.m_I16[13] = 6000;  b.m_I16[13] = 600;
  a.m_I16[14] = 7000;  b.m_I16[14] = -700;
  a.m_I16[15] = 8000;  b.m_I16[15] = 800;
  a.m_I16[16] = -1000;  b.m_I16[16] = 100;
  a.m_I16[17] = -2000;  b.m_I16[17] = -200;
  a.m_I16[18] = 3000;  b.m_I16[18] = 303;
  a.m_I16[19] = 4000;  b.m_I16[19] = -400;
  a.m_I16[20] = -5000;  b.m_I16[20] = 500;
  a.m_I16[21] = -6000;  b.m_I16[21] = -600;
  a.m_I16[22] = -7000;  b.m_I16[22] = 700;
  a.m_I16[23] = -8000;  b.m_I16[23] = 800;
  a.m_I16[24] = 30000;  b.m_I16[24] = 3000;   // add overflow
  a.m_I16[25] = 6000;  b.m_I16[25] = 32000;   // add overflow
  a.m_I16[26] = -25000; b.m_I16[26] = -27000;  // add overflow
  a.m_I16[27] = 8000;  b.m_I16[27] = 28700;   // add overflow
  a.m_I16[28] = 2000;  b.m_I16[28] = -31000;  // sub overflow
  a.m_I16[29] = 4000;  b.m_I16[29] = -30000;  // sub overflow
  a.m_I16[30] = -3000;  b.m_I16[30] = 32000;   // sub overflow
  a.m_I16[31] = -15000; b.m_I16[31] = 24000;   // sub overflow
  Avx512PackedMathI16_(&a, &b, c);
  cout <<" Results for Avx512PackedMathI16 ";
  cout << " i    a    b  vpaddw vpaddsw  vpsubw vpsubsw vpminsw vpmaxsw ";
  cout << "-------------------------------------------------------------------------- ";
  for (int i = 0; i < 32; i++)
  {
    cout << setw(2) << i << ' ';
    cout << setw(8) << a.m_I16[i] << ' ';
    cout << setw(8) << b.m_I16[i] << ' ';
    cout << setw(8) << c[0].m_I16[i] << ' ';
    cout << setw(8) << c[1].m_I16[i] << ' ';
    cout << setw(8) << c[2].m_I16[i] << ' ';
    cout << setw(8) << c[3].m_I16[i] << ' ';
    cout << setw(8) << c[4].m_I16[i] << ' ';
    cout << setw(8) << c[5].m_I16[i] << ' ';
  }
}
void Avx512PackedMathI64(void)
{
  alignas(64) ZmmVal a;
  alignas(64) ZmmVal b;
  alignas(64) ZmmVal c[6];
  uint32_t opmask = 0x7f;
  a.m_I64[0] = 64;    b.m_I64[0] = 4;
  a.m_I64[1] = 1024;   b.m_I64[1] = 5;
  a.m_I64[2] = -2048;   b.m_I64[2] = 2;
  a.m_I64[3] = 8192;   b.m_I64[3] = 5;
  a.m_I64[4] = -256;   b.m_I64[4] = 8;
  a.m_I64[5] = 4096;   b.m_I64[5] = 7;
  a.m_I64[6] = 16;    b.m_I64[6] = 3;
  a.m_I64[7] = 512;    b.m_I64[7] = 6;
  Avx512PackedMathI64_(&a, &b, c, opmask);
  cout << " Results for Avx512PackedMathI64 ";
  cout << "op_mask = " << hex << opmask << dec << ' ';
  cout << " i   a   b  vpaddq  vpsubq vpmullq vpsllvq vpsravq  vpabsq ";
  cout << "---------------------------------------------------------------------- ";
  for (int i = 0; i < 8; i++)
  {
    cout << setw(2) << i << ' ';
    cout << setw(6) << a.m_I64[i] << ' ';
    cout << setw(6) << b.m_I64[i] << ' ';
    cout << setw(8) << c[0].m_I64[i] << ' ';
    cout << setw(8) << c[1].m_I64[i] << ' ';
    cout << setw(8) << c[2].m_I64[i] << ' ';
    cout << setw(8) << c[3].m_I64[i] << ' ';
    cout << setw(8) << c[4].m_I64[i] << ' ';
    cout << setw(8) << c[5].m_I64[i] << ' ';
  }
}
int main()
{
  Avx512PackedMathI16();
  Avx512PackedMathI64();
  return 0;
}
;-------------------------------------------------
;        Ch14_01.asm
;-------------------------------------------------
; extern "C" void Avx512PackedMathI16_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6])
    .code
; Avx512PackedMathI16_ - packed word (16-bit) arithmetic on 512-bit operands.
;
; Arguments:  rcx = a (source operand, 64-byte aligned)
;             rdx = b (source operand, 64-byte aligned)
;             r8  = c (array of six ZmmVal results, 64-byte aligned)
;
; Results:    c[0] = vpaddw, c[1] = vpaddsw, c[2] = vpsubw,
;             c[3] = vpsubsw, c[4] = vpminsw, c[5] = vpmaxsw
;
; The operands are loaded with vmovdqa64, matching the aligned stores below and
; the loads in Avx512PackedMathI64_. No masking is applied, so the element size
; named by the move instruction does not affect the data that is transferred.
Avx512PackedMathI16_ proc
    vmovdqa64 zmm0,zmmword ptr [rcx]    ;zmm0 = a
    vmovdqa64 zmm1,zmmword ptr [rdx]    ;zmm1 = b
; Perform packed word operations
    vpaddw zmm2,zmm0,zmm1          ;add (wraparound)
    vmovdqa64 zmmword ptr [r8],zmm2     ;save vpaddw result
    vpaddsw zmm2,zmm0,zmm1         ;add with signed saturation
    vmovdqa64 zmmword ptr [r8+64],zmm2   ;save vpaddsw result
    vpsubw zmm2,zmm0,zmm1          ;sub (wraparound)
    vmovdqa64 zmmword ptr [r8+128],zmm2   ;save vpsubw result
    vpsubsw zmm2,zmm0,zmm1         ;sub with signed saturation
    vmovdqa64 zmmword ptr [r8+192],zmm2   ;save vpsubsw result
    vpminsw zmm2,zmm0,zmm1         ;signed minimums
    vmovdqa64 zmmword ptr [r8+256],zmm2   ;save vpminsw result
    vpmaxsw zmm2,zmm0,zmm1         ;signed maximums
    vmovdqa64 zmmword ptr [r8+320],zmm2   ;save vpmaxsw result
    vzeroupper
    ret
Avx512PackedMathI16_ endp
; extern "C" void Avx512PackedMathI64_(const ZmmVal* a, const ZmmVal* b, ZmmVal c[6], unsigned int opmask)
;
; Packed quadword (64-bit) arithmetic on 512-bit operands using zero masking.
;
; Arguments:  rcx = a, rdx = b (loaded with aligned moves)
;             r8  = c (six ZmmVal results, written with aligned moves)
;             r9d = opmask (only the low 8 bits are used, one bit per quadword)
;
; Results:    c[0] = vpaddq, c[1] = vpsubq, c[2] = vpmullq,
;             c[3] = vpsllvq, c[4] = vpsravq, c[5] = vpabsq
;
; Every operation uses {k1}{z}: result elements whose opmask bit is clear are
; set to zero.
Avx512PackedMathI64_ proc
    vmovdqa64 zmm0,zmmword ptr [rcx]    ;zmm0 = a
    vmovdqa64 zmm1,zmmword ptr [rdx]    ;zmm1 = b
    and r9d,0ffh              ;r9d = opmask value
    kmovb k1,r9d              ;k1 = opmask
; Perform packed quadword operations
    vpaddq zmm2{k1}{z},zmm0,zmm1      ;add
    vmovdqa64 zmmword ptr [r8],zmm2     ;save vpaddq result
    vpsubq zmm2{k1}{z},zmm0,zmm1      ;sub
    vmovdqa64 zmmword ptr [r8+64],zmm2   ;save vpsubq result
    vpmullq zmm2{k1}{z},zmm0,zmm1      ;signed mul (low 64 bits)
    vmovdqa64 zmmword ptr [r8+128],zmm2   ;save vpmullq result
    vpsllvq zmm2{k1}{z},zmm0,zmm1      ;shift left logical (per-element counts in zmm1)
    vmovdqa64 zmmword ptr [r8+192],zmm2   ;save vpsllvq result
    vpsravq zmm2{k1}{z},zmm0,zmm1      ;shift right arithmetic (per-element counts in zmm1)
    vmovdqa64 zmmword ptr [r8+256],zmm2   ;save vpsravq result
    vpabsq zmm2{k1}{z},zmm0         ;absolute value
    vmovdqa64 zmmword ptr [r8+320],zmm2   ;save vpabsq result
    vzeroupper
    ret
Avx512PackedMathI64_ endp
    end
Listing 14-1.

Example Ch14_01

The C++ functions Avx512PackedMathI16 and Avx512PackedMathI64 are the base routines that handle AVX-512 packed integer operations using word and quadword values. Each function begins its execution by initializing the applicable integer elements of two ZmmVal variables. Note that the C++ alignas(64) specifier is used with each ZmmVal. Following variable initialization, each base routine invokes its corresponding assembly language function: Avx512PackedMathI16_ or Avx512PackedMathI64_. The results are then streamed to cout.

The assembly language function Avx512PackedMathI16_ starts its execution with two vmovdqa64 instructions that load ZmmVal variables a and b into registers ZMM0 and ZMM1, respectively. Somewhat surprisingly, AVX512BW does not include aligned move instructions for 512-bit wide packed byte and word operands. Another alternative here would be to use the vmovdqu16 instruction. Note that this latter instruction must be used in cases where merge or zero masking is required. AVX512BW also includes a vmovdqu8 instruction for 512-bit wide packed byte operands. Following operand value loading, Avx512PackedMathI16_ demonstrates the packed word instructions vpaddw, vpaddsw, vpsubw, vpsubsw, vpminsw, vpmaxsw. Each 512-bit packed word result is then saved in the array c. Note that Avx512PackedMathI16_ uses a vzeroupper instruction prior to its ret instruction.

The assembly language function Avx512PackedMathI64_ exemplifies various arithmetic operations using 512-bit wide packed quadword instructions. Note that this function includes an argument value named opmask, which is employed to highlight packed quadword zero masking . Avx512PackedMathI64_ also uses a vzeroupper instruction prior to its ret instruction. Here are the results for source code example Ch14_01.
Results for Avx512PackedMathI16
 i    a    b  vpaddw vpaddsw  vpsubw vpsubsw vpminsw vpmaxsw
--------------------------------------------------------------------------
 0    10   100   110   110   -90   -90    10   100
 1    20   200   220   220   -180   -180    20   200
 2    30   300   330   330   -270   -270    30   300
 3    40   400   440   440   -360   -360    40   400
 4    50   500   550   550   -450   -450    50   500
 5    60   600   660   660   -540   -540    60   600
 6    70   700   770   770   -630   -630    70   700
 7    80   800   880   880   -720   -720    80   800
 8   1000   -100   900   900   1100   1100   -100   1000
 9   2000   200   2200   2200   1800   1800   200   2000
10   3000   -300   2700   2700   3300   3300   -300   3000
11   4000   400   4400   4400   3600   3600   400   4000
12   5000   -500   4500   4500   5500   5500   -500   5000
13   6000   600   6600   6600   5400   5400   600   6000
14   7000   -700   6300   6300   7700   7700   -700   7000
15   8000   800   8800   8800   7200   7200   800   8000
16  -1000   100   -900   -900  -1100  -1100  -1000   100
17  -2000   -200  -2200  -2200  -1800  -1800  -2000   -200
18   3000   303   3303   3303   2697   2697   303   3000
19   4000   -400   3600   3600   4400   4400   -400   4000
20  -5000   500  -4500  -4500  -5500  -5500  -5000   500
21  -6000   -600  -6600  -6600  -5400  -5400  -6000   -600
22  -7000   700  -6300  -6300  -7700  -7700  -7000   700
23  -8000   800  -7200  -7200  -8800  -8800  -8000   800
24  30000   3000  -32536  32767  27000  27000   3000  30000
25   6000  32000  -27536  32767  -26000  -26000   6000  32000
26  -25000  -27000  13536  -32768   2000   2000  -27000  -25000
27   8000  28700  -28836  32767  -20700  -20700   8000  28700
28   2000  -31000  -29000  -29000  -32536  32767  -31000   2000
29   4000  -30000  -26000  -26000  -31536  32767  -30000   4000
30  -3000  32000  29000  29000  30536  -32768  -3000  32000
31  -15000  24000   9000   9000  26536  -32768  -15000  24000
Results for Avx512PackedMathI64
op_mask = 7f
 i   a   b  vpaddq  vpsubq vpmullq vpsllvq vpsravq  vpabsq
----------------------------------------------------------------------
 0   64   4    68    60   256   1024    4    64
 1  1024   5   1029   1019   5120  32768    32   1024
 2 -2048   2  -2046  -2050  -4096  -8192   -512   2048
 3  8192   5   8197   8187  40960  262144   256   8192
 4  -256   8   -248   -264  -2048  -65536    -1   256
 5  4096   7   4103   4089  28672  524288    32   4096
 6   16   3    19    13    48   128    2    16
 7  512   6    0    0    0    0    0    0

Image Processing

The source code examples in this section explicate image-processing algorithms and techniques using AVX-512 packed integer instructions. Most of the source code examples are updated versions of examples from earlier chapters that exploited AVX or AVX2 instructions. Besides exemplifying AVX-512 packed integer instruction usage, the source code examples that follow also accentuate alternative algorithmic approaches and instruction sequences that often result in improved performance.

Pixel Conversions

In Chapter 7, you learned how to use the AVX instruction set to convert unsigned 8-bit pixels to single-precision floating-point pixels and vice versa (see example Ch07_06). Source code example Ch14_02 demonstrates how to carry out these same conversions using AVX-512 instructions. Listing 14-2 shows the source code for example Ch14_02.
//------------------------------------------------
//        Ch14_02.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <cstdint>
#include <random>
#include "AlignedMem.h"
using namespace std;
// Ch14_02_Misc.cpp
// C++ conversion functions plus the verify helpers. The two Avx512ConvertImgVerify
// overloads compare num_pixels elements of src1 and src2; the callers in this file
// report the return value as the number of pixel compare errors.
extern bool Avx512ConvertImgU8ToF32Cpp(float* des, const uint8_t* src, uint32_t num_pixels);
extern bool Avx512ConvertImgF32ToU8Cpp(uint8_t* des, const float* src, uint32_t num_pixels);
extern uint32_t Avx512ConvertImgVerify(const float* src1, const float* src2, uint32_t num_pixels);
extern uint32_t Avx512ConvertImgVerify(const uint8_t* src1, const uint8_t* src2, uint32_t num_pixels);
// Ch14_02_.asm
// AVX-512 assembly language conversion functions. Both return false when
// num_pixels is zero, exceeds c_NumPixelsMax, or is not a multiple of 64, or
// when des or src is not aligned on a 64-byte boundary.
extern "C" bool Avx512ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels);
extern "C" bool Avx512ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels);
// Fills x[0..n-1] with pseudo-random pixel values drawn uniformly from [0, 255].
// The sequence is fully determined by seed.
void InitU8(uint8_t* x, uint32_t n, unsigned int seed)
{
  std::default_random_engine engine {seed};
  std::uniform_int_distribution<> pixel_dist {0, 255};

  uint32_t idx = 0;
  while (idx < n)
  {
    const int draw = pixel_dist(engine);
    x[idx++] = draw;
  }
}
// Fills x[0..n-1] with pseudo-random values in [0.0, 1.0], generated as a
// uniformly distributed integer in [0, 1000] divided by 1000.0f.
// The sequence is fully determined by seed.
void InitF32(float* x, uint32_t n, unsigned int seed)
{
  std::default_random_engine engine {seed};
  std::uniform_int_distribution<> thousandths {0, 1000};

  uint32_t idx = 0;
  while (idx < n)
  {
    const int draw = thousandths(engine);
    x[idx++] = (float)draw / 1000.0f;
  }
}
void Avx512ConvertImgU8ToF32(void)
{
  const size_t align = 64;
  const uint32_t num_pixels = 1024;
  AlignedArray<uint8_t> src_aa(num_pixels, align);
  AlignedArray<float> des1_aa(num_pixels, align);
  AlignedArray<float> des2_aa(num_pixels, align);
  uint8_t* src = src_aa.Data();
  float* des1 = des1_aa.Data();
  float* des2 = des2_aa.Data();
  InitU8(src, num_pixels, 12);
  bool rc1 = Avx512ConvertImgU8ToF32Cpp(des1, src, num_pixels);
  bool rc2 = Avx512ConvertImgU8ToF32_(des2, src, num_pixels);
  cout << " Results for Avx512ConvertImgU8ToF32 ";
  if (!rc1 || !rc2)
  {
    cout << "Invalid return code - ";
    cout << "rc1 = " << boolalpha << rc1 << ", ";
    cout << "rc2 = " << boolalpha << rc2 << ' ';
    return;
  }
  uint32_t num_diff = Avx512ConvertImgVerify(des1, des2, num_pixels);
  cout << " Number of pixel compare errors (num_diff) = " << num_diff << ' ';
}
void Avx512ConvertImgF32ToU8(void)
{
  const size_t align = 64;
  const uint32_t num_pixels = 1024;
  AlignedArray<float> src_aa(num_pixels, align);
  AlignedArray<uint8_t> des1_aa(num_pixels, align);
  AlignedArray<uint8_t> des2_aa(num_pixels, align);
  float* src = src_aa.Data();
  uint8_t* des1 = des1_aa.Data();
  uint8_t* des2 = des2_aa.Data();
  InitF32(src, num_pixels, 20);
  // Test values to demonstrate clipping in conversion functions
  src[0] = 0.5f;     src[8] = 3.33f;
  src[1] = -1.0f;     src[9] = 0.67f;
  src[2] = 0.38f;     src[10] = 0.75f;
  src[3] = 0.62f;     src[11] = 0.95f;
  src[4] = 2.1f;     src[12] = -0.33f;
  src[5] = 0.25f;     src[13] = 0.8f;
  src[6] = -1.25f;    src[14] = 0.12f;
  src[7] = 0.45f;     src[15] = 4.0f;
  bool rc1 = Avx512ConvertImgF32ToU8Cpp(des1, src, num_pixels);
  bool rc2 = Avx512ConvertImgF32ToU8_(des2, src, num_pixels);
  cout << " Results for Avx512ConvertImgF32ToU8 ";
  if (!rc1 || !rc2)
  {
    cout << "Invalid return code - ";
    cout << "rc1 = " << boolalpha << rc1 << ", ";
    cout << "rc2 = " << boolalpha << rc2 << ' ';
    return;
  }
  uint32_t num_diff = Avx512ConvertImgVerify(des1, des2, num_pixels);
  cout << " Number of pixel compare errors (num_diff) = " << num_diff << ' ';
}
int main()
{
  Avx512ConvertImgU8ToF32();
  Avx512ConvertImgF32ToU8();
  return 0;
}
;-------------------------------------------------
;        Ch14_02.asm
;-------------------------------------------------
    include <cmpequ.asmh>
    extern c_NumPixelsMax:dword
      .const
r4_1p0   real4 1.0
r4_255p0  real4 255.0
; extern "C" bool Avx512ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels)
    .code
; Converts num_pixels unsigned 8-bit pixels to single-precision floating-point
; values normalized to [0.0, 1.0] (each pixel is multiplied by 1.0 / 255.0).
;
; Arguments:  rcx = des, rdx = src, r8d = num_pixels
; Returns:    eax = 1 on success; eax = 0 if num_pixels is zero, greater than
;             c_NumPixelsMax, or not a multiple of 64, or if des or src is not
;             aligned on a 64-byte boundary.
;
; Each loop iteration processes one block of 64 pixels.
Avx512ConvertImgU8ToF32_ proc
; Make sure num_pixels is valid and pixel buffers are properly aligned
    xor eax,eax             ;set error return code
    or r8d,r8d
    jz Done               ;jump if num_pixels is zero
    cmp r8d,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8d,3fh
    jnz Done              ;jump if num_pixels % 64 != 0
    test rcx,3fh
    jnz Done              ;jump if des not aligned
    test rdx,3fh
    jnz Done              ;jump if src not aligned
; Perform required initializations
    shr r8d,6                ;number of blocks (64 pixels/block)
    vmovss xmm0,real4 ptr [r4_1p0]
    vdivss xmm1,xmm0,real4 ptr [r4_255p0]
    vbroadcastss zmm5,xmm1         ;packed scale factor (1.0 / 255.0)
    align 16
; Load 64 pixels, zero-extending each byte to a doubleword (16 pixels per register)
@@:   vpmovzxbd zmm0,xmmword ptr [rdx]
    vpmovzxbd zmm1,xmmword ptr [rdx+16]
    vpmovzxbd zmm2,xmmword ptr [rdx+32]
    vpmovzxbd zmm3,xmmword ptr [rdx+48] ;zmm3:zmm0 = 64 U32 pixels
; Convert pixels from uint8_t to float [0.0, 255.0]
    vcvtudq2ps zmm16,zmm0
    vcvtudq2ps zmm17,zmm1
    vcvtudq2ps zmm18,zmm2
    vcvtudq2ps zmm19,zmm3        ;zmm19:zmm16 = 64 F32 pixels
; Normalize pixels to [0.0, 1.0]
    vmulps zmm20,zmm16,zmm5
    vmulps zmm21,zmm17,zmm5
    vmulps zmm22,zmm18,zmm5
    vmulps zmm23,zmm19,zmm5       ;zmm23:zmm20 = 64 F32 pixels (normalized)
; Save F32 pixels to des
    vmovaps zmmword ptr [rcx],zmm20
    vmovaps zmmword ptr [rcx+64],zmm21
    vmovaps zmmword ptr [rcx+128],zmm22
    vmovaps zmmword ptr [rcx+192],zmm23
; Update pointers and counters (64 source bytes in, 64 floats = 256 bytes out)
    add rdx,64
    add rcx,256
    sub r8d,1
    jnz @B
    mov eax,1            ;set success return code
Done:  vzeroupper
    ret
Avx512ConvertImgU8ToF32_ endp
; extern "C" bool Avx512ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels)
; Converts num_pixels single-precision floating-point pixels to unsigned 8-bit
; pixels. Each value is clipped to [0.0, 1.0], scaled by 255.0, converted to an
; unsigned doubleword using round-up, and stored as a byte with unsigned
; saturation.
;
; Arguments:  rcx = des, rdx = src, r8d = num_pixels
; Returns:    eax = 1 on success; eax = 0 if num_pixels is zero, greater than
;             c_NumPixelsMax, or not a multiple of 64, or if des or src is not
;             aligned on a 64-byte boundary.
;
; Each loop iteration processes one block of 16 pixels.
Avx512ConvertImgF32ToU8_ proc
; Make sure num_pixels is valid and pixel buffers are properly aligned
    xor eax,eax             ;set error return code
    or r8d,r8d
    jz Done               ;jump if num_pixels is zero
    cmp r8d,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8d,3fh
    jnz Done              ;jump if num_pixels % 64 != 0
    test rcx,3fh
    jnz Done              ;jump if des not aligned
    test rdx,3fh
    jnz Done              ;jump if src not aligned
; Perform required initializations
    shr r8d,4              ;number of pixel blocks (16 pixels / block)
    vxorps zmm29,zmm29,zmm29      ;packed 0.0
    vbroadcastss zmm30,[r4_1p0]     ;packed 1.0
    vbroadcastss zmm31,[r4_255p0]    ;packed 255.0
    align 16
@@:   vmovaps zmm0,zmmword ptr [rdx]   ;zmm0 = block of 16 pixels
; Clip pixels in current block to [0.0, 1.0]
    vcmpps k1,zmm0,zmm29,CMP_GE     ;k1 = mask of pixels >= 0.0
    vmovaps zmm1{k1}{z},zmm0      ;all pixels >= 0.0 (others zeroed)
    vcmpps k2,zmm1,zmm30,CMP_GT     ;k2 = mask of pixels > 1.0
    vmovaps zmm1{k2},zmm30       ;all pixels clipped to [0.0, 1.0]
; Convert pixels to uint8_t and save to des
    vmulps zmm2,zmm1,zmm31       ;all pixels [0.0, 255.0]
    vcvtps2udq zmm3,zmm2{ru-sae}    ;all pixels [0, 255]
    vpmovusdb xmmword ptr [rcx],zmm3  ;save pixels as unsigned bytes
; Update pointers and counters (16 floats = 64 bytes in, 16 bytes out)
    add rdx,64
    add rcx,16
    sub r8d,1
    jnz @B
    mov eax,1              ;set success return code
Done:  vzeroupper
    ret
Avx512ConvertImgF32ToU8_ endp
    end
Listing 14-2.

Example Ch14_02

The C++ code in Listing 14-2 begins with the requisite function declarations. The first declaration set is for the functions Avx512ConvertImgU8ToF32Cpp and Avx512ConvertImgF32ToU8Cpp, which are defined in the file Ch14_02_Misc.cpp. The source code for these functions is not shown since they’re almost identical to the AVX2 counterpart functions that were used in source code example Ch07_06. Two minor changes were made: the source and destination pixel buffers are aligned on a 64-byte instead of a 16-byte boundary; the number of pixels in these buffers must be evenly divisible by 64 instead of 32.

The function Avx512ConvertImgU8ToF32 initializes the test arrays for converting pixels values from uint8_t to float. This function uses the C++ template class AlignedArray<> to allocate these arrays on a 64-byte boundary. Following test array initialization, Avx512ConvertImgU8ToF32 invokes the C++ and assembly language conversion functions. It then calls Avx512ConvertImgVerify to verify the results. The function Avx512ConvertImgF32ToU8 converts pixel values from float to uint8_t. Note that this function intentionally initializes the first few values of the source pixel buffer src to known values in order to verify that the conversion functions properly clip out-of-range pixel values.

The assembly language function Avx512ConvertImgU8ToF32_ begins its execution by validating num_pixels. It then confirms that the pixel buffers src and des are properly aligned on a 64-byte boundary. In source code example Ch07_06 from Chapter 7, pixel normalization was performed by dividing each pixel value by 255.0. Avx512ConvertImgU8ToF32_ carries out pixel normalization using the multiplicative scale factor 1.0/255.0 since floating-point multiplication is usually faster than floating-point division. The vbroadcastss zmm5,xmm1 instruction loads a packed version of this scale factor into register ZMM5.

Each processing loop iteration starts with a vpmovzxbd zmm0,xmmword ptr [rdx] instruction. This instruction copies and zero-extends the 16 byte (uint8_t) pixels pointed to by RDX to doublewords; it then saves these values in register ZMM0. Three more vpmovzxbd instructions are then employed to load another 48 pixels into registers ZMM1, ZMM2, and ZMM3. This is followed by four vcvtudq2ps instructions that convert each unsigned doubleword pixel value in registers ZMM0–ZMM3 to single-precision floating-point. The ensuing vmulps instructions multiply these values by the normalization scale factor; the results are then saved to the destination pixel buffer des using a series of vmovaps instructions.

In source code example Ch07_06, all floating-point pixel values were clipped to [0.0, 1.0] before being converted to uint8_t values. The function Avx512ConvertImgF32ToU8_ also performs this same operation. Following its argument validation checks, Avx512ConvertImgF32ToU8_ loads registers ZMM29, ZMM30, and ZMM31 with packed versions of the single-precision floating-point constants 0.0, 1.0, and 255.0, respectively. The processing loop of Avx512ConvertImgF32ToU8_ begins each iteration with a vmovaps zmm0,zmmword ptr [rdx] that loads a block of 16 single-precision floating-point pixels into register ZMM0. The ensuing vcmpps k1,zmm0,zmm29,CMP_GE instruction compares each pixel element in ZMM0 to 0.0 and saves the resultant compare mask in opmask register K1. The next instruction, vmovaps zmm1{k1}{z},zmm0, uses zero masking to eliminate all pixel values less than 0.0. Figure 14-1 illustrates these operations.
../images/326959_2_En_14_Chapter/326959_2_En_14_Fig1_HTML.jpg
Figure 14-1.

Instruction sequence used to convert packed pixel values from floating-point to unsigned doubleword integers

The subsequent vcmpps k2,zmm1,zmm30,CMP_GT instruction creates a mask of pixel values greater than 1.0 and saves this mask in opmask register K2. Following execution of the vmovaps zmm1{k2},zmm30 instruction, all of the pixel values in register ZMM1 are greater than or equal to 0.0 and less than or equal to 1.0. The next two instructions, vmulps zmm2,zmm1,zmm31 and vcvtps2udq zmm3,zmm2{ru-sae}, convert the normalized floating-point pixel values to unsigned doubleword integers. Note that the vcvtps2udq instruction employs an instruction-level rounding control operand (round up) primarily for demonstration purposes. The ensuing vpmovusdb xmmword ptr [rcx],zmm3 instruction size-reduces the doubleword values to bytes using unsigned saturation and saves them in the destination buffer pointed to by RCX. Here are the results for source code example Ch14_02:
Results for Avx512ConvertImgU8ToF32
 Number of pixel compare errors (num_diff) = 0
Results for Avx512ConvertImgF32ToU8
 Number of pixel compare errors (num_diff) = 0

Image Thresholding

In source code example Ch07_08, you learned about image thresholding and how to create a binary (or two color) mask image . Briefly, thresholding is an image-processing technique that sets a mask image pixel to 0xff to signify that the intensity value of the corresponding pixel in a grayscale image is greater than a pre-determined threshold intensity value; otherwise, the mask image pixel is set to 0x00. The next source code example, Ch14_03, expands the image-thresholding technique that was used in Ch07_08 to support multiple compare operators. Listing 14-3 shows the source code for example Ch14_03.
//------------------------------------------------
//        Ch14_03.h
//------------------------------------------------
#pragma once
#include <cstdint>
// Compare operators.
// The numeric values are spelled out because the assembly language function
// Avx512ComparePixels_ selects its compare loop by testing the raw integer
// value of cmp_op against 0 through 5.
enum CmpOp
{
  EQ = 0,   // equal
  NE = 1,   // not equal
  LT = 2,   // less than
  LE = 3,   // less than or equal
  GT = 4,   // greater than
  GE = 5    // greater than or equal
};
// Ch14_03_Misc.cpp
// Ancillary functions for test array initialization and result display
// (definitions are not part of this listing).
extern void Init(uint8_t* x, size_t n, unsigned int seed);
extern void ShowResults(const uint8_t* des1, const uint8_t* des2, size_t num_pixels, CmpOp cmp_op,
  uint8_t cmp_val, size_t test_id);
// Ch14_03_.asm
// AVX-512 thresholding function: writes 0xff to des where the src pixel compares
// true against cmp_val under cmp_op, 0x00 otherwise. Returns false when
// num_pixels is zero, exceeds c_NumPixelsMax, or is not a multiple of 64, when
// des or src is not 64-byte aligned, or when cmp_op is not a valid CmpOp value.
extern "C" bool Avx512ComparePixels_(uint8_t* des, const uint8_t* src, size_t num_pixels,
  CmpOp cmp_op, uint8_t cmp_val);
//------------------------------------------------
//        Ch14_03.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <cassert>
#include "Ch14_03.h"
#include "AlignedMem.h"
using namespace std;
// Upper limit on num_pixels accepted by the compare functions. Declared with C
// linkage because the assembly language code references it (as a qword).
extern "C" const size_t c_NumPixelsMax = 16777216;
bool Avx512ComparePixelsCpp(uint8_t* des, const uint8_t* src, size_t num_pixels, CmpOp cmp_op, uint8_t cmp_val)
{
  // Make sure num_pixels is valid
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if ((num_pixels & 0x3f) != 0)
    return false;
  // Make sure src and des are aligned on a 64-byte boundary
  if (!AlignedMem::IsAligned(src, 64))
    return false;
  if (!AlignedMem::IsAligned(des, 64))
    return false;
  bool rc = true;
  const uint8_t cmp_false = 0x00;
  const uint8_t cmp_true = 0xff;
  switch (cmp_op)
  {
    case CmpOp::EQ:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] == cmp_val) ? cmp_true : cmp_false;
      break;
    case CmpOp::NE:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] != cmp_val) ? cmp_true : cmp_false;
      break;
    case CmpOp::LT:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] < cmp_val) ? cmp_true : cmp_false;
      break;
    case CmpOp::LE:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] <= cmp_val) ? cmp_true : cmp_false;
      break;
    case CmpOp::GT:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] > cmp_val) ? cmp_true : cmp_false;
      break;
    case CmpOp::GE:
      for (size_t i = 0; i < num_pixels; i++)
        des[i] = (src[i] >= cmp_val) ? cmp_true : cmp_false;
      break;
    default:
      cout << "Invalid CmpOp: " << cmp_op << ' ';
      rc = false;
  }
  return rc;
}
int main()
{
  const size_t align = 64;
  const size_t num_pixels = 4 * 1024 * 1024;
  AlignedArray<uint8_t> src_aa(num_pixels, align);
  AlignedArray<uint8_t> des1_aa(num_pixels, align);
  AlignedArray<uint8_t> des2_aa(num_pixels, align);
  uint8_t* src = src_aa.Data();
  uint8_t* des1 = des1_aa.Data();
  uint8_t* des2 = des2_aa.Data();
  const uint8_t cmp_vals[] {197, 222, 43, 43, 129, 222};
  const CmpOp cmp_ops[] {CmpOp::EQ, CmpOp::NE, CmpOp::LT, CmpOp::LE, CmpOp::GT, CmpOp::GE};
  const size_t num_cmp_vals = sizeof(cmp_vals) / sizeof(uint8_t);
  const size_t num_cmp_ops = sizeof(cmp_ops) / sizeof(CmpOp);
  assert(num_cmp_vals == num_cmp_ops);
  Init(src, num_pixels, 511);
  cout << "Results for Ch14_03 ";
  for (size_t i = 0; i < num_cmp_ops; i++)
  {
    Avx512ComparePixelsCpp(des1, src, num_pixels, cmp_ops[i], cmp_vals[i]);
    Avx512ComparePixels_(des2, src, num_pixels, cmp_ops[i], cmp_vals[i]);
    ShowResults(des1, des2, num_pixels, cmp_ops[i], cmp_vals[i], i + 1);
  }
  return 0;
}
;-------------------------------------------------
;        Ch14_03.asm
;-------------------------------------------------
    include <cmpequ.asmh>
    extern c_NumPixelsMax:qword
; Macro _CmpPixels
;
; Expands to a complete compare loop followed by the function's success return;
; control does not continue past the expansion.
;
; Macro argument: CmpOp = compare predicate equate passed to vpcmpub
;
; Required register state on entry:
;   rax  = 0 (byte offset into both pixel buffers)
;   rcx  = mask image (destination) pixel buffer
;   rdx  = grayscale image (source) pixel buffer
;   r8   = number of pixels (non-zero multiple of 64)
;   zmm4 = packed byte compare values
;   zmm5 = packed 0xff bytes
_CmpPixels macro CmpOp
    align 16
@@:   vmovdqa64 zmm0,zmmword ptr [rdx+rax]  ;load next block of 64 pixels
    vpcmpub k1,zmm0,zmm4,CmpOp       ;perform compare operation
    vmovdqu8 zmm1{k1}{z},zmm5        ;set mask pixels to 0 or 255 using opmask
    vmovdqa64 zmmword ptr [rcx+rax],zmm1  ;save mask pixels
    add rax,64               ;update offset
    sub r8,64
    jnz @B                 ;repeat until done
    mov eax,1                ;set success return code
    vzeroupper
    ret
    endm
; extern "C" bool Avx512ComparePixels_(uint8_t* des, const uint8_t* src,
;  size_t num_pixels, CmpOp cmp_op, uint8_t cmp_val);
    .code
; Builds a mask image by comparing each source pixel against cmp_val.
;
; Arguments:  rcx = des, rdx = src, r8 = num_pixels, r9d = cmp_op,
;             [rsp+40] = cmp_val (fifth argument, passed on the stack)
; Returns:    eax = 1 on success; eax = 0 if num_pixels is zero, greater than
;             c_NumPixelsMax, or not a multiple of 64, if des or src is not
;             aligned on a 64-byte boundary, or if cmp_op is not in 0..5.
;
; Each _CmpPixels expansion ends with its own ret, so the compare chain below
; behaves like a switch statement: exactly one loop runs per call.
Avx512ComparePixels_ proc
; Make sure num_pixels is valid and pixel buffers are properly aligned
    xor eax,eax             ;set error code (also array offset)
    or r8,r8
    jz Done               ;jump if num_pixels is zero
    cmp r8,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8,3fh
    jnz Done              ;jump if num_pixels % 64 != 0
    test rcx,3fh
    jnz Done              ;jump if des not aligned
    test rdx,3fh
    jnz Done              ;jump if src not aligned
; Perform required initializations
    vpbroadcastb zmm4,byte ptr [rsp+40] ;zmm4 = packed cmp_val
    mov r10d,255
    vpbroadcastb zmm5,r10d       ;zmm5 = packed 255
; Perform specified compare operation
    cmp r9d,0
    jne LB_NE
    _CmpPixels CMP_EQ          ;CmpOp::EQ
LB_NE: cmp r9d,1
    jne LB_LT
    _CmpPixels CMP_NEQ         ;CmpOp::NE
LB_LT: cmp r9d,2
    jne LB_LE
    _CmpPixels CMP_LT          ;CmpOp::LT
LB_LE: cmp r9d,3
    jne LB_GT
    _CmpPixels CMP_LE          ;CmpOp::LE
LB_GT: cmp r9d,4
    jne LB_GE
    _CmpPixels CMP_NLE         ;CmpOp::GT (not less than or equal)
LB_GE: cmp r9d,5
    jne Done              ;unrecognized cmp_op: return with eax = 0
    _CmpPixels CMP_NLT         ;CmpOp::GE (not less than)
Done:  vzeroupper
    ret
Avx512ComparePixels_ endp
    end
Listing 14-3.

Example Ch14_03

Near the top of the header file Ch14_03.h is an enum named CmpOp, which contains identifiers for the common compare operations. This is followed by the example’s function declarations. The C++ functions Init and ShowResults are ancillary functions that perform test array initialization and display results. The source code for these functions is not shown in Listing 14-3 but is included with the chapter download package. The function Avx512ComparePixels_ is an AVX-512 assembly language function that implements the pixel thresholding algorithm.

The function Avx512ComparePixelsCpp contains the C++ implementation of the updated thresholding algorithm. This function begins its execution by validating num_pixels for size and divisibility by 64. It then verifies that the pixel buffers src and des are properly aligned on a 64-byte boundary. Following argument validation code is a switch statement that applies the selector cmp_op to select a compare operation. Each switch statement case code block is a simple for loop that compares src[i] against cmp_val using the specified operator and sets pixels in the mask image to 0xff (true compare) or 0x00 (false compare). The function main includes code that allocates the image pixel buffers, exercises the functions Avx512ComparePixelsCpp and Avx512ComparePixels_ using various compare operators and displays results.

The assembly language code in Listing 14-3 commences with the macro _CmpPixels. This macro generates AVX-512 code that implements a processing loop for a pixel compare operator. The macro _CmpPixels requires the following register initializations prior to its use: RAX = 0, RCX = mask image pixel buffer, RDX = grayscale image pixel buffer, R8 = number of pixels, ZMM4 = packed byte threshold values, and ZMM5 = packed 0xff byte values. Each processing loop iteration of _CmpPixels begins with a vmovdqa64 zmm0,zmmword ptr [rdx+rax] instruction that loads 64 unsigned 8-bit integers into register ZMM0. The next instruction, vpcmpub k1,zmm0,zmm4,CmpOp, compares the grayscale pixel intensity values in ZMM0 to the packed values in ZMM4; it then saves the resultant mask in opmask register K1. The ensuing vmovdqu8 zmm1{k1}{z},zmm5 instruction sets each mask pixel value in ZMM1 to 0xff (true compare) or 0x00 (false compare) according to the value of the corresponding bit position in K1. The instruction vmovdqa64 zmmword ptr [rcx+rax],zmm1 then saves the 64 mask pixels to the mask image pixel buffer.

The function Avx512ComparePixels_ employs the macro _CmpPixels to implement the same algorithm as its C++ counterpart Avx512ComparePixelsCpp. Following the requisite argument validation checks, a vpbroadcastb zmm4,byte ptr [rsp+40] instruction broadcasts cmp_val to each byte element in register ZMM4. The next two instructions, mov r10d,255 and vpbroadcastb zmm5,r10d, load the value 0xff into each byte element of ZMM5. The remaining code in Avx512ComparePixels_ uses the argument value cmp_op to implement an ad hoc switch statement that takes advantage of the macro _CmpPixels. Note that this function uses the compare equates CMP_NLE (not less than or equal) or CMP_NLT (not less than) for the _CmpPixels macro argument CmpOp instead of CMP_GT or CMP_GE. The reason for this is that the vpcmpub instruction in _CmpPixels does not support use of the CMP_GT and CMP_GE equates (mathematically these latter equates are equivalent to CMP_NLE and CMP_NLT, but are assigned different values in cmpequ.asmh). Here are the results for source code example Ch14_03:
Results for Ch14_03
Test #1
 num_pixels: 4194304
 cmp_op:   EQ
 cmp_val:  197
 Pixel masks are identical
 Number of non-zero mask pixels = 16424
Test #2
 num_pixels: 4194304
 cmp_op:   NE
 cmp_val:  222
 Pixel masks are identical
 Number of non-zero mask pixels = 4177927
Test #3
 num_pixels: 4194304
 cmp_op:   LT
 cmp_val:  43
 Pixel masks are identical
 Number of non-zero mask pixels = 703652
Test #4
 num_pixels: 4194304
 cmp_op:   LE
 cmp_val:  43
 Pixel masks are identical
 Number of non-zero mask pixels = 719787
Test #5
 num_pixels: 4194304
 cmp_op:   GT
 cmp_val:  129
 Pixel masks are identical
 Number of non-zero mask pixels = 2065724
Test #6
 num_pixels: 4194304
 cmp_op:   GE
 cmp_val:  222
 Pixel masks are identical
 Number of non-zero mask pixels = 556908

Image Statistics

Listing 14-4 shows the source code for example Ch14_04. This example illustrates how to calculate the mean and standard deviation of a grayscale image using its pixel intensity values. In order to make source code example Ch14_04 a little more interesting, the C++ and assembly language functions use only the pixel values that reside between two threshold limits. Pixel values outside of these limits are excluded from any mean and standard deviation calculations.
//------------------------------------------------
//        Ch14_04.h
//------------------------------------------------
#pragma once
#include <cstdint>
// This structure must match the structure that's defined in Ch14_04_.asm.
// It carries both the inputs and the results of the image statistics
// calculating functions: the caller fills in the first four members and the
// calculating function writes the remaining five.
struct ImageStats
{
  uint8_t* m_PixelBuffer;        // in:  pixel values to examine (one byte per pixel)
  uint64_t m_NumPixels;          // in:  number of pixels in m_PixelBuffer
  uint32_t m_PixelValMin;        // in:  smallest pixel value included (inclusive)
  uint32_t m_PixelValMax;        // in:  largest pixel value included (inclusive)
  uint64_t m_NumPixelsInRange;   // out: count of pixels inside [min, max]
  uint64_t m_PixelSum;           // out: sum of the in-range pixel values
  uint64_t m_PixelSumOfSquares;  // out: sum of the squared in-range pixel values
  double m_PixelMean;            // out: mean of the in-range pixel values
  double m_PixelSd;              // out: standard deviation of the in-range pixel values
};
// Ch14_04.cpp
extern bool Avx512CalcImageStatsCpp(ImageStats& im_stats);
// Ch14_04_.asm
extern "C" bool Avx512CalcImageStats_(ImageStats& im_stats);
// Ch14_04_BM.cpp
extern void Avx512CalcImageStats_BM(void);
// Common constants: inclusive pixel value limits used by the test code
const uint32_t c_PixelValMin = 40;
const uint32_t c_PixelValMax = 230;
//------------------------------------------------
//        Ch14_04.cpp
//------------------------------------------------
#include "stdafx.h"
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <stdexcept>
#include "Ch14_04.h"
#include "AlignedMem.h"
#include "ImageMatrix.h"
using namespace std;
// Largest pixel count the calculating functions accept. Declared extern "C"
// so that Ch14_04_.asm can reference it (there: extern c_NumPixelsMax:qword).
extern "C" uint64_t c_NumPixelsMax = 256 * 1024;
bool Avx512CalcImageStatsCpp(ImageStats& im_stats)
{
  uint64_t num_pixels = im_stats.m_NumPixels;
  const uint8_t* pb = im_stats.m_PixelBuffer;
  // Perform validation checks
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if (!AlignedMem::IsAligned(pb, 64))
    return false;
  // Calculate intermediate sums
  im_stats.m_PixelSum = 0;
  im_stats.m_PixelSumOfSquares = 0;
  im_stats.m_NumPixelsInRange = 0;
  for (size_t i = 0; i < num_pixels; i++)
  {
    uint32_t pval = pb[i];
    if (pval >= im_stats.m_PixelValMin && pval <= im_stats.m_PixelValMax)
    {
      im_stats.m_PixelSum += pval;
      im_stats.m_PixelSumOfSquares += pval * pval;
      im_stats.m_NumPixelsInRange++;
    }
  }
  // Calculate mean and standard deviation
  double temp0 = (double)im_stats.m_NumPixelsInRange * im_stats.m_PixelSumOfSquares;
  double temp1 = (double)im_stats.m_PixelSum * im_stats.m_PixelSum;
  double var_num = temp0 - temp1;
  double var_den = (double)im_stats.m_NumPixelsInRange * (im_stats.m_NumPixelsInRange - 1);
  double var = var_num / var_den;
  im_stats.m_PixelMean = (double)im_stats.m_PixelSum / im_stats.m_NumPixelsInRange;
  im_stats.m_PixelSd = sqrt(var);
  return true;
}
void Avx512CalcImageStats()
{
  const wchar_t* image_fn = L"..\Ch14_Data\TestImage4.bmp";
  ImageStats is1, is2;
  ImageMatrix im(image_fn);
  uint64_t num_pixels = im.GetNumPixels();
  uint8_t* pb = im.GetPixelBuffer<uint8_t>();
  is1.m_PixelBuffer = pb;
  is1.m_NumPixels = num_pixels;
  is1.m_PixelValMin = c_PixelValMin;
  is1.m_PixelValMax = c_PixelValMax;
  is2.m_PixelBuffer = pb;
  is2.m_NumPixels = num_pixels;
  is2.m_PixelValMin = c_PixelValMin;
  is2.m_PixelValMax = c_PixelValMax;
  const char nl = ' ';
  const char* s = " | ";
  const unsigned int w1 = 22;
  const unsigned int w2 = 12;
  cout << fixed << setprecision(6) << left;
  wcout << fixed << setprecision(6) << left;
  cout << " Results for Avx512CalcImageStats ";
  wcout << setw(w1) << "image_fn:" << setw(w2) << image_fn << nl;
  cout << setw(w1) << "num_pixels:" << setw(w2) << num_pixels << nl;
  cout << setw(w1) << "c_PixelValMin:" << setw(w2) << c_PixelValMin << nl;
  cout << setw(w1) << "c_PixelValMax:" << setw(w2) << c_PixelValMax << nl;
  bool rc1 = Avx512CalcImageStatsCpp(is1);
  bool rc2 = Avx512CalcImageStats_(is2);
  if (!rc1 || !rc2)
  {
    cout << "Bad return code ";
    cout << " rc1 = " << rc1 << ' ';
    cout << " rc2 = " << rc2 << ' ';
    return;
  }
  cout << nl;
  cout << setw(w1) << "m_NumPixelsInRange: ";
  cout << setw(w2) << is1.m_NumPixelsInRange << s;
  cout << setw(w2) << is2.m_NumPixelsInRange << nl;
  cout << setw(w1) << "m_PixelSum:";
  cout << setw(w2) << is1.m_PixelSum << s;
  cout << setw(w2) << is2.m_PixelSum << nl;
  cout << setw(w1) << "m_PixelSumOfSquares:";
  cout << setw(w2) << is1.m_PixelSumOfSquares << s;
  cout << setw(w2) << is2.m_PixelSumOfSquares << nl;
  cout << setw(w1) << "m_PixelMean:";
  cout << setw(w2) << is1.m_PixelMean << s;
  cout << setw(w2) << is2.m_PixelMean << nl;
  cout << setw(w1) << "m_PixelSd:";
  cout << setw(w2) << is1.m_PixelSd << s;
  cout << setw(w2) << is2.m_PixelSd << nl;
}
int main()
{
  try
  {
    Avx512CalcImageStats();
    Avx512CalcImageStats_BM();
  }
  catch (runtime_error& rte)
  {
    cout << "'runtime_error' exception has occurred - " << rte.what() << ' ';
  }
  catch (...)
  {
    cout << "Unexpected exception has occurred ";
    cout << "File = " << __FILE__ << ' ';
  }
  return 0;
}
;-------------------------------------------------
;        Ch14_04.asm
;-------------------------------------------------
    include <cmpequ.asmh>
    include <MacrosX86-64-AVX.asmh>
    extern c_NumPixelsMax:qword
; This structure must match the structure that's defined in Ch14_04.h
; (same member order and member sizes; offsets shown are in bytes).
ImageStats     struct
PixelBuffer     qword ?                 ;offset  0: pixel buffer pointer
NumPixels      qword ?                  ;offset  8: number of pixels in buffer
PixelValMin     dword ?                 ;offset 16: lower pixel value limit
PixelValMax     dword ?                 ;offset 20: upper pixel value limit
NumPixelsInRange  qword ?               ;offset 24: count of in-range pixels
PixelSum      qword ?                   ;offset 32: sum of in-range pixels
PixelSumOfSquares  qword ?              ;offset 40: sum of squared in-range pixels
PixelMean      real8 ?                  ;offset 48: mean (double)
PixelSd       real8 ?                   ;offset 56: standard deviation (double)
ImageStats     ends
; _UpdateSums Disp
;
; Processes the 16 pixels located at [rcx+Disp]. Pixels outside the range
; held in zmm31 (packed pixel_val_min) and zmm30 (packed pixel_val_max) are
; zeroed before being accumulated.
;
; Expects:  rcx   = pixel buffer pointer
;           zmm31 = packed dword pixel_val_min, zmm30 = packed dword pixel_val_max
; Updates:  zmm16 = packed dword pixel_sum
;           zmm17 = packed dword pixel_sum_of_squares
;           r10   = num_pixels_in_range
; Modifies: zmm0, zmm1, zmm2, k1, k2, k3, rax
;
; NOTE(review): the Intel SDM lists the GPR form of kmovw with a 32-bit
; destination register - confirm the assembler in use accepts rax here.
_UpdateSums macro Disp
    vpmovzxbd zmm0,xmmword ptr [rcx+Disp]  ;zmm0 = 16 pixels (zero-extended to dwords)
    vpcmpud k1,zmm0,zmm31,CMP_GE      ;k1 = mask of pixels >= pixel_val_min
    vpcmpud k2,zmm0,zmm30,CMP_LE      ;k2 = mask of pixels <= pixel_val_max
    kandw k3,k2,k1             ;k3 = mask of in-range pixels
    vmovdqa32 zmm1{k3}{z},zmm0       ;zmm1 = in-range pixels (others zeroed)
    vpaddd zmm16,zmm16,zmm1         ;update packed pixel_sum
    vpmulld zmm2,zmm1,zmm1          ;zmm2 = squares of in-range pixels
    vpaddd zmm17,zmm17,zmm2         ;update packed pixel_sum_of_squares
    kmovw rax,k3
    popcnt rax,rax             ;count number of in-range pixels
    add r10,rax               ;update num_pixels_in_range
    endm
; extern "C" bool Avx512CalcImageStats_(ImageStats& im_stats);
    .code
; Avx512CalcImageStats_
;
; Calculates the mean and standard deviation of the in-range pixels of the
; image described by the ImageStats structure that rcx points to.
;
; In:       rcx = pointer to ImageStats (PixelBuffer, NumPixels, PixelValMin
;                 and PixelValMax must be set by the caller)
; Out:      eax = 1 on success; 0 if NumPixels is zero, NumPixels exceeds
;                 c_NumPixelsMax, or PixelBuffer is not 64-byte aligned
;           On success NumPixelsInRange, PixelSum, PixelSumOfSquares,
;           PixelMean and PixelSd are written to the structure.
;
; Register use after validation:
;           rsi = ImageStats ptr        rcx = current pixel ptr
;           rdx = pixels remaining      r10 = num_pixels_in_range
;           r13 = scalar pixel_sum      r12 = scalar pixel_sum_of_squares
;           zmm29/zmm28 = packed qword pixel_sum / pixel_sum_of_squares
Avx512CalcImageStats_ proc frame
    _CreateFrame CIS_,0,0,rsi,r12,r13
    _EndProlog
; Make sure num_pixels is valid and pixel_buff is properly aligned
    xor eax,eax                         ;set error return code
    mov rsi,rcx                         ;rsi = im_stats ptr
    mov rcx,qword ptr [rsi+ImageStats.PixelBuffer]  ;rcx = pixel buffer ptr
    mov rdx,qword ptr [rsi+ImageStats.NumPixels]    ;rdx = num_pixels
    test rdx,rdx
    jz Done                             ;jump if num_pixels is zero
    cmp rdx,[c_NumPixelsMax]
    ja Done                             ;jump if num_pixels too big
    test rcx,3fh
    jnz Done                            ;jump if pixel_buff misaligned
; Perform required initializations
    mov r8d,dword ptr [rsi+ImageStats.PixelValMin]
    mov r9d,dword ptr [rsi+ImageStats.PixelValMax]
    vpbroadcastd zmm31,r8d              ;packed pixel_val_min
    vpbroadcastd zmm30,r9d              ;packed pixel_val_max
    vpxorq zmm29,zmm29,zmm29            ;packed pixel_sum
    vpxorq zmm28,zmm28,zmm28            ;packed pixel_sum_of_squares
    xor r10d,r10d                       ;num_pixels_in_range = 0
; The scalar sums are zeroed here (not in the tail-loop setup) because they
; are added to the packed totals at LB3 even when the tail loop is skipped,
; which happens whenever num_pixels is a multiple of 64.
    xor r13d,r13d                       ;scalar pixel_sum = 0
    xor r12d,r12d                       ;scalar pixel_sum_of_squares = 0
; Compute packed versions of pixel_sum and pixel_sum_of_squares
    cmp rdx,64
    jb LB1                              ;jump if there are fewer than 64 pixels
    align 16
@@: vpxord zmm16,zmm16,zmm16            ;loop packed pixel_sum = 0
    vpxord zmm17,zmm17,zmm17            ;loop packed pixel_sum_of_squares = 0
    _UpdateSums 0                       ;process pixel_buff[i+15]:pixel_buff[i]
    _UpdateSums 16                      ;process pixel_buff[i+31]:pixel_buff[i+16]
    _UpdateSums 32                      ;process pixel_buff[i+47]:pixel_buff[i+32]
    _UpdateSums 48                      ;process pixel_buff[i+63]:pixel_buff[i+48]
    vextracti32x8 ymm0,zmm16,1          ;extract top 8 pixel_sum (dwords)
    vpaddd ymm1,ymm0,ymm16
    vpmovzxdq zmm2,ymm1
    vpaddq zmm29,zmm29,zmm2             ;update packed pixel_sum (qwords)
    vextracti32x8 ymm0,zmm17,1          ;extract top 8 pixel_sum_of_squares (dwords)
    vpaddd ymm1,ymm0,ymm17
    vpmovzxdq zmm2,ymm1
    vpaddq zmm28,zmm28,zmm2             ;update packed pixel_sum_of_squares (qwords)
    add rcx,64                          ;update pb ptr
    sub rdx,64                          ;update num_pixels
    cmp rdx,64
    jae @B                              ;repeat until done
; Process the final (num_pixels % 64) pixels using scalar arithmetic
    align 16
LB1: test rdx,rdx
    jz LB3                              ;jump if no more pixels remain
    mov r11,rdx                         ;number of remaining pixels (mul below overwrites rdx)
@@: movzx rax,byte ptr [rcx]            ;load next pixel
    cmp rax,r8
    jb LB2                              ;jump if current pixel < pval_min
    cmp rax,r9
    ja LB2                              ;jump if current pixel > pval_max
    add r13,rax                         ;add to pixel_sum
    mul rax                             ;rdx:rax = pixel * pixel
    add r12,rax                         ;add to pixel_sum_of_squares
    add r10,1                           ;update num_pixels_in_range
LB2: add rcx,1
    sub r11,1
    jnz @B                              ;repeat until done
; Save num_pixel_in_range
LB3: mov qword ptr [rsi+ImageStats.NumPixelsInRange],r10
; Reduce packed pixel_sum to single qword
    vextracti64x4 ymm0,zmm29,1
    vpaddq ymm1,ymm0,ymm29
    vextracti64x2 xmm2,ymm1,1
    vpaddq xmm3,xmm2,xmm1
    vpextrq rax,xmm3,0
    vpextrq r11,xmm3,1
    add rax,r11                         ;rax = sum of qwords in zmm29
    add r13,rax                         ;add scalar pixel_sum
    mov qword ptr [rsi+ImageStats.PixelSum],r13
; Reduce packed pixel_sum_of_squares to single qword
    vextracti64x4 ymm0,zmm28,1
    vpaddq ymm1,ymm0,ymm28
    vextracti64x2 xmm2,ymm1,1
    vpaddq xmm3,xmm2,xmm1
    vpextrq rax,xmm3,0
    vpextrq r11,xmm3,1
    add rax,r11                         ;rax = sum of qwords in zmm28
    add r12,rax                         ;add scalar pixel_sum_of_squares
    mov qword ptr [rsi+ImageStats.PixelSumOfSquares],r12
; Calculate final mean and sd
    vcvtusi2sd xmm0,xmm0,r10            ;num_pixels_in_range (DPFP)
    sub r10,1
    vcvtusi2sd xmm1,xmm1,r10            ;num_pixels_in_range - 1 (DPFP)
    vcvtusi2sd xmm2,xmm2,r13            ;pixel_sum (DPFP)
    vcvtusi2sd xmm3,xmm3,r12            ;pixel_sum_of_squares (DPFP)
    vdivsd xmm4,xmm2,xmm0               ;final pixel_mean
    vmovsd real8 ptr [rsi+ImageStats.PixelMean],xmm4
    vmulsd xmm4,xmm0,xmm3               ;num_pixels_in_range * pixel_sum_of_squares
    vmulsd xmm5,xmm2,xmm2               ;pixel_sum * pixel_sum
    vsubsd xmm2,xmm4,xmm5               ;var_num
    vmulsd xmm3,xmm0,xmm1               ;var_den
    vdivsd xmm4,xmm2,xmm3               ;calc variance
    vsqrtsd xmm0,xmm0,xmm4              ;final pixel_sd
    vmovsd real8 ptr [rsi+ImageStats.PixelSd],xmm0
    mov eax,1                           ;set success return code
Done: vzeroupper
    _DeleteFrame rsi,r12,r13
    ret
Avx512CalcImageStats_ endp
    end
Listing 14-4.

Example Ch14_04

The mean and standard deviation of the pixels in a grayscale image can be calculated using the following equations:
$$ \overline{x}=\frac{1}{n}\sum\limits_i{x}_i $$
$$ s=\sqrt{\frac{n\sum\limits_i{x}_i^2-{\left(\sum\limits_i{x}_i\right)}^2}{n\;\left(n-1\right)}} $$

In the mean and standard deviation equations, the symbol $x_i$ represents an image buffer pixel and n denotes the number of pixels. If you study these equations carefully, you will notice that two intermediate sums must be calculated: the sum of all pixels and the sum of all pixel values squared. Once these quantities are known, the mean and standard deviation can be determined using simple arithmetic. The standard deviation equation that’s detailed here is simple to calculate and suitable for this source code example. For other use cases, however, this same equation is often unsuitable for standard deviation calculations, especially those that involve floating-point values. You may want to consult the statistical variance calculating references that are listed in Appendix A before using this equation in one of your own programs.

Listing 14-4 begins with the C++ header file Ch14_04.h that includes the declaration of a structure named ImageStats. This structure is used to pass image data to the C++ and assembly language calculating functions and return results. A semantically equivalent structure is also defined in the assembly language file Ch14_04_.asm. The file Ch14_04.h also includes the constant definitions c_PixelValMin and c_PixelValMax, which define the range limits that a pixel value must fall between to be included in any statistical calculations.

The function Avx512CalcImageStatsCpp is the principal calculating function in the C++ code. This function requires a reference to an ImageStats structure as its sole argument. Following argument validation, Avx512CalcImageStatsCpp initializes the ImageStats intermediate sums m_PixelSum, m_PixelSumOfSquares, and m_NumPixelsInRange to zero. A simple for loop follows, which calculates m_PixelSum and m_PixelSumOfSquares. During each loop iteration, pixel values are tested for in-range validity before being included in any calculations. Following computation of the intermediate sums, the function Avx512CalcImageStatsCpp calculates the final mean and standard deviation. Note that m_NumPixelsInRange is used to calculate these statistical quantities instead of m_NumPixels. The remaining code in Ch14_04.cpp performs test case initialization, invokes the calculating functions, and streams the results to cout.

Toward the top of the file Ch14_04_.asm is the assembly language version of the structure ImageStats. This is followed by the macro definition _UpdateSums whose inner workings will be described shortly. The function Avx512CalcImageStats_ begins its execution by performing the same argument validation checks as its C++ counterpart. It then initializes packed versions of the intermediate values PixelValMin and PixelValMax. The ensuing vpxorq instructions initialize packed quadword versions of PixelSum and PixelSumOfSquares to zero. Note that the vpxor[d|q] (and other AVX-512 bitwise Boolean) instructions can optionally specify an opmask operand register to perform merge or zero masking of doubleword or quadword elements. The final initialization instruction, xor r10d,r10d, sets NumPixelsInRange to zero.

The processing loop in function Avx512CalcImageStats_ processes 64 pixels each iteration. Prior to the start of the processing loop, register RDX is tested to verify that at least 64 pixels remain. Each processing loop iteration begins with two vpxord instructions that initialize packed doubleword versions of pixel_sum and pixel_sum_of_squares to zero. Following this are four instances of the macro _UpdateSums, which process in aggregate the next group of 64 pixels. The first instruction of this macro, vpmovzxbd zmm0,xmmword ptr [rcx+Disp], loads 16 unsigned byte values from the source pixel buffer and saves these values as unsigned doublewords in register ZMM0. The ensuing vpcmpud k1,zmm0,zmm31,CMP_GE, vpcmpud k2,zmm0,zmm30,CMP_LE, and kandw k3,k2,k1 instructions load opmask register K3 with a mask value of pixels that are greater than or equal to pixel_val_min and less than or equal to pixel_val_max. The vmovdqa32 zmm1{k3}{z},zmm0 instruction that follows uses zero masking to effectively eliminate out-of-range pixel values from further calculations. The subsequent vpaddd and vpmulld instructions then update the packed doubleword quantities pixel_sum and pixel_sum_of_squares. The total number of in-range pixels in R10 is then updated using the instructions kmovw rax,k3, popcnt rax,rax, and add r10,rax. Figure 14-2 illustrates these calculations in greater detail. Note that this figure shows only the low-order 256 bits of each ZMM register and the low-order 8 bits of each opmask register.
../images/326959_2_En_14_Chapter/326959_2_En_14_Fig2_HTML.jpg
Figure 14-2.

Calculations performed by instructions in macro _UpdateSums

Following the four _UpdateSums usages, the doubleword elements of registers ZMM16 and ZMM17 contain packed copies of the values pixel_sum and pixel_sum_of_squares for the current block of 64 pixels. The vextracti32x8 ymm0,zmm16,1 and vpaddd ymm1,ymm0,ymm16 instructions reduce the number of doubleword values in register ZMM16 from 16 to 8. The ensuing vpmovzxdq zmm2,ymm1 instruction promotes these doubleword values to quadwords, and the vpaddq zmm29,zmm29,zmm2 instruction updates the global packed quadword pixel_sum values that are maintained in register ZMM29. A similar sequence of instructions is then used to update the global packed quadword pixel_sum_of_squares values in register ZMM28. Following these instructions, the processing loop updates its pointer register and counters; it then repeats until the number of remaining pixels falls below 64.

The block of code that starts at the label LB1 computes pixel_sum and pixel_sum_of_squares for the final few pixels (if any) using scalar integer arithmetic and the general-purpose registers. A series of extract (vextracti64x4, vextracti64x2, and vpextrq) and vpaddq instructions reduce the eight packed quadword pixel_sum values in ZMM29 to a single quadword value. A similar sequence of instructions is then used to calculate the final value for pixel_sum_of_squares. Note that these intermediate results are saved in the ImageStats structure pointed to by register RSI. The function Avx512CalcImageStats_ then executes a chain of vcvtusi2sd instructions to convert the intermediate results from unsigned quadword integers to double-precision floating-point. The final mean and standard deviation values are calculated using scalar double-precision floating-point arithmetic. The results for source code example Ch14_04 follow this paragraph. Table 14-1 shows benchmark timing measurements for the C++ and assembly language calculating functions Avx512CalcImageStatsCpp and Avx512CalcImageStats_.
Results for Avx512CalcImageStats
image_fn:       ..\Ch14_Data\TestImage4.bmp
num_pixels:      258130
c_PixelValMin:    40
c_PixelValMax:    230
m_NumPixelsInRange:  229897    | 229897
m_PixelSum:      32574462   | 32574462
m_PixelSumOfSquares: 5139441032  | 5139441032
m_PixelMean:     141.691549  | 141.691549
m_PixelSd:      47.738056  | 47.738056
Running benchmark function Avx512CalcImageStats_BM - please wait
Benchmark times save to file Ch14_04_Avx512CalcImageStats_BM_CHROMIUM.csv
Table 14-1.

Benchmark Timing Measurements for Image Statistics Calculating Functions Using TestImage4.bmp

CPU

Avx512CalcImageStatsCpp

Avx512CalcImageStats_

i7-4790S

----

----

i9-7900X

404

29

i7-8700K

----

----

RGB to Grayscale Conversion

In Chapter 10, you learned how to use the AVX2 instruction set to convert an RGB image to a grayscale image (see example Ch10_06). Listing 14-5 shows the source code for example Ch14_05, which illustrates RGB to grayscale image conversion using the AVX-512 instruction set.
//------------------------------------------------
//        Ch14_05.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <stdexcept>
#include <iomanip>
#include "Ch14_05.h"
#include "ImageMatrix.h"
#include "AlignedMem.h"
using namespace std;
// Pixel count limits enforced by the conversion functions. Declared
// extern "C" so that Ch14_05_.asm can reference them (there: extern ...:dword).
extern "C" const int c_NumPixelsMin = 64;
extern "C" const int c_NumPixelsMax = 16 * 1024 * 1024;
// RGB to grayscale conversion coefficients (weights for R, G, and B, in that order)
const float c_Coef[3] {0.2126f, 0.7152f, 0.0722f};
// Compares two grayscale pixel buffers of num_pixels bytes each.
// Returns true when every pair of corresponding pixels differs by at most 1
// (a tolerance for rounding differences); returns false at the first pair
// that differs by more. A non-positive num_pixels yields true.
bool CompareGsImages(const uint8_t* pb_gs1,const uint8_t* pb_gs2, int num_pixels)
{
  int idx = 0;

  while (idx < num_pixels)
  {
    const int delta = static_cast<int>(pb_gs1[idx]) - static_cast<int>(pb_gs2[idx]);

    if (delta < -1 || delta > 1)
      return false;

    ++idx;
  }

  return true;
}
bool Avx512RgbToGsCpp(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3])
{
  if (num_pixels < c_NumPixelsMin || num_pixels > c_NumPixelsMax)
    return false;
  if (num_pixels % 64 != 0)
    return false;
  if (!AlignedMem::IsAligned(pb_gs, 64))
    return false;
  const size_t align = 64;
  const uint8_t* pb_r = pb_rgb[0];
  const uint8_t* pb_g = pb_rgb[1];
  const uint8_t* pb_b = pb_rgb[2];
  if (!AlignedMem::IsAligned(pb_r, align))
    return false;
  if (!AlignedMem::IsAligned(pb_g, align))
    return false;
  if (!AlignedMem::IsAligned(pb_b, align))
    return false;
  for (int i = 0; i < num_pixels; i++)
  {
    uint8_t r = pb_r[i];
    uint8_t g = pb_g[i];
    uint8_t b = pb_b[i];
    float gs_temp = r * coef[0] + g * coef[1] + b * coef[2] + 0.5f;
    if (gs_temp < 0.0f)
      gs_temp = 0.0f;
    else if (gs_temp > 255.0f)
      gs_temp = 255.0f;
    pb_gs[i] = (uint8_t)gs_temp;
  }
  return true;
}
void Avx512RgbToGs(void)
{
  const wchar_t* fn_rgb = L"..\Ch14_Data\TestImage3.bmp";
  const wchar_t* fn_gs1 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS1.bmp";
  const wchar_t* fn_gs2 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS2.bmp";
  const wchar_t* fn_gs3 = L"Ch14_05_Avx512RgbToGs_TestImage3_GS3.bmp";
  ImageMatrix im_rgb(fn_rgb);
  int im_h = im_rgb.GetHeight();
  int im_w = im_rgb.GetWidth();
  int num_pixels = im_h * im_w;
  ImageMatrix im_r(im_h, im_w, PixelType::Gray8);
  ImageMatrix im_g(im_h, im_w, PixelType::Gray8);
  ImageMatrix im_b(im_h, im_w, PixelType::Gray8);
  RGB32* pb_rgb = im_rgb.GetPixelBuffer<RGB32>();
  uint8_t* pb_r = im_r.GetPixelBuffer<uint8_t>();
  uint8_t* pb_g = im_g.GetPixelBuffer<uint8_t>();
  uint8_t* pb_b = im_b.GetPixelBuffer<uint8_t>();
  uint8_t* pb_rgb_cp[3] {pb_r, pb_g, pb_b};
  for (int i = 0; i < num_pixels; i++)
  {
    pb_rgb_cp[0][i] = pb_rgb[i].m_R;
    pb_rgb_cp[1][i] = pb_rgb[i].m_G;
    pb_rgb_cp[2][i] = pb_rgb[i].m_B;
  }
  ImageMatrix im_gs1(im_h, im_w, PixelType::Gray8);
  ImageMatrix im_gs2(im_h, im_w, PixelType::Gray8);
  ImageMatrix im_gs3(im_h, im_w, PixelType::Gray8);
  uint8_t* pb_gs1 = im_gs1.GetPixelBuffer<uint8_t>();
  uint8_t* pb_gs2 = im_gs2.GetPixelBuffer<uint8_t>();
  uint8_t* pb_gs3 = im_gs3.GetPixelBuffer<uint8_t>();
  // Exercise conversion functions
  bool rc1 = Avx512RgbToGsCpp(pb_gs1, pb_rgb_cp, num_pixels, c_Coef);
  bool rc2 = Avx512RgbToGs_(pb_gs2, pb_rgb_cp, num_pixels, c_Coef);
  bool rc3 = Avx2RgbToGs_(pb_gs3, pb_rgb_cp, num_pixels, c_Coef);
  if (rc1 && rc2 && rc3)
  {
    im_gs1.SaveToBitmapFile(fn_gs1);
    im_gs2.SaveToBitmapFile(fn_gs2);
    im_gs2.SaveToBitmapFile(fn_gs3);
    bool c1 = CompareGsImages(pb_gs1, pb_gs2, num_pixels);
    bool c2 = CompareGsImages(pb_gs2, pb_gs3, num_pixels);
    if (c1 && c2)
      cout << "Grayscale image compare OK ";
    else
      cout << "Grayscale image compare failed ";
  }
  else
    cout << "Invalid return code ";
}
int main()
{
  try
  {
    Avx512RgbToGs();
    Avx512RgbToGs_BM();
  }
  catch (runtime_error& rte)
  {
    cout << "'runtime_error' exception has occurred - " << rte.what() << ' ';
  }
  catch (...)
  {
    cout << "Unexpected exception has occurred ";
  }
  return 0;
}
;-------------------------------------------------
;        Ch14_05.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
    extern c_NumPixelsMin:dword
    extern c_NumPixelsMax:dword
      .const
r4_0p5   real4 0.5
r4_255p0  real4 255.0
; extern "C" bool Avx512RgbToGs_(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3]);
    .code
; Avx512RgbToGs_
;
; Converts three separate R, G, B byte planes into a grayscale byte buffer,
; 16 pixels per loop iteration, using 512-bit wide operands.
;
; In:   rcx = pb_gs (destination), rdx = pb_rgb (array of 3 plane pointers),
;       r8d = num_pixels, r9 = coef (3 single-precision values)
; Out:  eax = 1 on success; 0 if num_pixels is outside
;       [c_NumPixelsMin, c_NumPixelsMax], is not a multiple of 64, or any of
;       the four buffers is not 64-byte aligned
;
; Register use in the loop: r13/r14/r15 = R/G/B plane pointers,
; rax = byte offset into all buffers, r8 = pixels remaining.
Avx512RgbToGs_ proc frame
    _CreateFrame RGBGS0_,0,96,r13,r14,r15
    _SaveXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _EndProlog
    xor eax,eax             ;error return code (also pixel_buffer offset)
    cmp r8d,[c_NumPixelsMin]
    jl Done               ;jump if num_pixels < min value
    cmp r8d,[c_NumPixelsMax]
    jg Done               ;jump if num_pixels > max value
    test r8d,3fh
    jnz Done              ;jump if (num_pixels % 64) != 0
    test rcx,3fh
    jnz Done              ;jump if pb_gs is not aligned
    mov r13,[rdx]
    test r13,3fh
    jnz Done              ;jump if pb_r is not aligned
    mov r14,[rdx+8]
    test r14,3fh
    jnz Done              ;jump if pb_g is not aligned
    mov r15,[rdx+16]
    test r15,3fh
    jnz Done              ;jump if pb_b is not aligned
; Perform required initializations
    vbroadcastss zmm10,real4 ptr [r9]    ;zmm10 = packed coef[0]
    vbroadcastss zmm11,real4 ptr [r9+4]   ;zmm11 = packed coef[1]
    vbroadcastss zmm12,real4 ptr [r9+8]   ;zmm12 = packed coef[2]
    vbroadcastss zmm13,real4 ptr [r4_0p5]  ;zmm13 = packed 0.5
    vbroadcastss zmm14,real4 ptr [r4_255p0] ;zmm14 = packed 255.0
    vxorps zmm15,zmm15,zmm15        ;zmm15 = packed 0.0
; A 32-bit register write clears bits 63:32, so this makes the full r8 equal
; to num_pixels for the 64-bit 'sub r8,r10' at the bottom of the loop
    mov r8d,r8d               ;r8 = num_pixels
    mov r10,16               ;r10 - number of pixels / iteration
; Load next block of pixels
    align 16
@@:   vpmovzxbd zmm0,xmmword ptr [r13+rax]  ;zmm0 = 16 pixels (r values)
    vpmovzxbd zmm1,xmmword ptr [r14+rax]  ;zmm1 = 16 pixels (g values)
    vpmovzxbd zmm2,xmmword ptr [r15+rax]  ;zmm2 = 16 pixels (b values)
; Convert dword values to SPFP and multiply by coefficients
    vcvtdq2ps zmm0,zmm0         ;zmm0 = 16 pixels SPFP (r values)
    vcvtdq2ps zmm1,zmm1         ;zmm1 = 16 pixels SPFP (g values)
    vcvtdq2ps zmm2,zmm2         ;zmm2 = 16 pixels SPFP (b values)
    vmulps zmm0,zmm0,zmm10       ;zmm0 = r values * coef[0]
    vmulps zmm1,zmm1,zmm11       ;zmm1 = g values * coef[1]
    vmulps zmm2,zmm2,zmm12       ;zmm2 = b values * coef[2]
; Sum color components & clip values to [0.0, 255.0]
    vaddps zmm3,zmm0,zmm1        ;r + g
    vaddps zmm4,zmm3,zmm2        ;r + g + b
    vaddps zmm5,zmm4,zmm13       ;r + g + b + 0.5
    vminps zmm0,zmm5,zmm14       ;clip pixels above 255.0
    vmaxps zmm1,zmm0,zmm15       ;clip pixels below 0.0
; Convert grayscale values from SPFP to byte, save results
    vcvtps2dq zmm2,zmm1         ;convert SPFP values to dwords
    vpmovusdb xmm3,zmm2         ;convert to bytes
    vmovdqa xmmword ptr [rcx+rax],xmm3 ;save grayscale image pixels
    add rax,r10
    sub r8,r10
    jnz @B
    mov eax,1              ;set success return code
Done:  vzeroupper
    _RestoreXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _DeleteFrame r13,r14,r15
    ret
Avx512RgbToGs_ endp
; extern "C" bool Avx2RgbToGs_(uint8_t* pb_gs, const uint8_t* const* pb_rgb, int num_pixels, const float coef[3]);
    .code
; Avx2RgbToGs_
;
; Converts three separate R, G, B byte planes into a grayscale byte buffer,
; 8 pixels per loop iteration, using 256-bit wide operands.
;
; In:   rcx = pb_gs (destination), rdx = pb_rgb (array of 3 plane pointers),
;       r8d = num_pixels, r9 = coef (3 single-precision values)
; Out:  eax = 1 on success; 0 if num_pixels is outside
;       [c_NumPixelsMin, c_NumPixelsMax], is not a multiple of 64, or any of
;       the four buffers is not 64-byte aligned
;
; Register use in the loop: r13/r14/r15 = R/G/B plane pointers,
; rax = byte offset into all buffers, r8 = pixels remaining.
Avx2RgbToGs_ proc frame
    _CreateFrame RGBGS1_,0,96,r13,r14,r15
    _SaveXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _EndProlog
    xor eax,eax             ;error return code (also pixel_buffer offset)
    cmp r8d,[c_NumPixelsMin]
    jl Done               ;jump if num_pixels < min value
    cmp r8d,[c_NumPixelsMax]
    jg Done               ;jump if num_pixels > max value
    test r8d,3fh
    jnz Done              ;jump if (num_pixels % 64) != 0
    test rcx,3fh
    jnz Done              ;jump if pb_gs is not aligned
    mov r13,[rdx]
    test r13,3fh
    jnz Done              ;jump if pb_r is not aligned
    mov r14,[rdx+8]
    test r14,3fh
    jnz Done              ;jump if pb_g is not aligned
    mov r15,[rdx+16]
    test r15,3fh
    jnz Done              ;jump if pb_b is not aligned
; Perform required initializations
    vbroadcastss ymm10,real4 ptr [r9]    ;ymm10 = packed coef[0]
    vbroadcastss ymm11,real4 ptr [r9+4]   ;ymm11 = packed coef[1]
    vbroadcastss ymm12,real4 ptr [r9+8]   ;ymm12 = packed coef[2]
    vbroadcastss ymm13,real4 ptr [r4_0p5]  ;ymm13 = packed 0.5
    vbroadcastss ymm14,real4 ptr [r4_255p0] ;ymm14 = packed 255.0
    vxorps ymm15,ymm15,ymm15        ;ymm15 = packed 0.0
; A 32-bit register write clears bits 63:32, so this makes the full r8 equal
; to num_pixels for the 64-bit 'sub r8,r10' at the bottom of the loop
    mov r8d,r8d               ;r8 = num_pixels
    mov r10,8                ;r10 - number of pixels / iteration
; Load next block of pixels
    align 16
@@:   vpmovzxbd ymm0,qword ptr [r13+rax]   ;ymm0 = 8 pixels (r values)
    vpmovzxbd ymm1,qword ptr [r14+rax]   ;ymm1 = 8 pixels (g values)
    vpmovzxbd ymm2,qword ptr [r15+rax]   ;ymm2 = 8 pixels (b values)
; Convert dword values to SPFP and multiply by coefficients
    vcvtdq2ps ymm0,ymm0         ;ymm0 = 8 pixels SPFP (r values)
    vcvtdq2ps ymm1,ymm1         ;ymm1 = 8 pixels SPFP (g values)
    vcvtdq2ps ymm2,ymm2         ;ymm2 = 8 pixels SPFP (b values)
    vmulps ymm0,ymm0,ymm10       ;ymm0 = r values * coef[0]
    vmulps ymm1,ymm1,ymm11       ;ymm1 = g values * coef[1]
    vmulps ymm2,ymm2,ymm12       ;ymm2 = b values * coef[2]
; Sum color components & clip values to [0.0, 255.0]
    vaddps ymm3,ymm0,ymm1        ;r + g
    vaddps ymm4,ymm3,ymm2        ;r + g + b
    vaddps ymm5,ymm4,ymm13       ;r + g + b + 0.5
    vminps ymm0,ymm5,ymm14       ;clip pixels above 255.0
    vmaxps ymm1,ymm0,ymm15       ;clip pixels below 0.0
; Convert grayscale components from SPFP to byte, save results
; The pack instructions operate on each 128-bit lane independently, so
; pixels 0-3 and pixels 4-7 end up in two separate dwords of xmm5 and are
; stored with two 4-byte writes
    vcvtps2dq ymm2,ymm1         ;convert SPFP values to dwords
    vpackusdw ymm3,ymm2,ymm2
    vextracti128 xmm4,ymm3,1
    vpackuswb xmm5,xmm3,xmm4      ;byte GS pixels in xmm5[31:0] and xmm5[95:64]
    vpextrd r11d,xmm5,0         ;r11d = 4 grayscale pixels
    mov dword ptr [rcx+rax],r11d    ;save grayscale image pixels
    vpextrd r11d,xmm5,2         ;r11d = 4 grayscale pixels
    mov dword ptr [rcx+rax+4],r11d   ;save grayscale image pixels
    add rax,r10
    sub r8,r10
    jnz @B
    mov eax,1              ;set success return code
Done:  vzeroupper
    _RestoreXmmRegs xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _DeleteFrame r13,r14,r15
    ret
Avx2RgbToGs_ endp
    end
Listing 14-5.

Example Ch14_05

The algorithm that’s used in this example to perform RGB to grayscale image conversion is the same one that was used in Ch10_06. As explained in Chapter 10, the algorithm uses a simple weighted average to transform an RGB image pixel into a grayscale image pixel. The C++ function Avx512RgbToGs begins its execution by loading the test image file. It then copies the RGB pixels of im_rgb into three separate color component image buffers. The reason for doing this is that this example’s RGB to grayscale conversion functions require a structure of arrays (SOA) instead of an array of structures (AOS), which was employed in source code example Ch10_06. Following allocation of the grayscale image buffers, Avx512RgbToGs invokes the C++ and assembly language conversion functions. The resultant grayscale image buffers are then compared for equality and saved.

The assembly language code in Listing 14-5 includes two functions: Avx512RgbToGs_ and Avx2RgbToGs_. As implied by their respective name prefixes, these functions perform RGB to grayscale image conversions using AVX-512 and AVX2 instructions, respectively. The function Avx512RgbToGs_ begins its execution by validating num_pixels for size and divisibility by 64. It then checks the source and destination pixel buffers for proper alignment. The ensuing series of vbroadcastss instructions load packed versions of the color conversion coefficients into registers ZMM10, ZMM11, and ZMM12. This is followed by another set of vbroadcastss instructions that broadcast the single-precision floating-point constants 0.5, 255.0, and 0.0 to registers ZMM13, ZMM14, and ZMM15. The mov r8d,r8d instruction zero-extends num_pixels into R8, and the mov r10,16 instruction loads R10 with the number of pixels to process during each loop iteration.

Each Avx512RgbToGs_ processing loop iteration starts with three vpmovzxbd instructions that load 16 red, green, and blue pixel values into registers ZMM0, ZMM1, and ZMM2. The ensuing vcvtdq2ps instructions convert the doubleword pixel values to single-precision floating-point. The floating-point color values are then multiplied by the corresponding color coefficients using a series of vmulps instructions. These values are then summed using three vaddps instructions. The resultant 16 grayscale pixel values are then clipped to [0.0, 255.0] and converted to doubleword values. The vpmovusdb xmm3,zmm2 instruction size-reduces the doubleword values to bytes using unsigned saturation, and the vmovdqa xmmword ptr [rcx+rax],xmm3 instruction saves the 16 byte pixel values to the destination grayscale image buffer.

The assembly language function Avx2RgbToGs_ is identical to its AVX-512 counterpart except for two minor changes: Avx2RgbToGs_ uses AVX2 instructions and the YMM register set to carry out the required calculations; it also uses the vpackusdw and vpackuswb instructions in conjunction with a few other instructions to perform the doubleword to byte size reductions. The reason for this is that AVX2 does not support the vpmovusdb instruction. Here is the output for source code example Ch14_05:
Grayscale image compare OK
Running benchmark function Avx512RgbToGs_BM - please wait
Benchmark times save to file Ch14_05_Avx512RgbToGs_BM_CHROMIUM.csv
Table 14-2 shows the benchmark timing measurements for source code example Ch14_05.
Table 14-2.

Mean Execution Times (Microseconds) for RGB to Grayscale Image Conversion Using TestImage3.bmp

CPU

Avx512RgbToGsCpp

Avx512RgbToGs_

Avx2RgbToGs_

i7-4790S

----

----

----

i9-7900X

1125

134

259

i7-8700K

----

----

----

The benchmark time differences between the AVX-512 and AVX2 implementations of the RGB to grayscale conversion algorithm are consistent with what one might expect. It is interesting to compare these numbers with the benchmark timing measurements from source code example Ch10_06 (see Table 10-2). This earlier example used an array of RGB32 pixels (or AOS) for the source image buffer, and the mean execution time for the conversion function Avx2ConvertRgbToGs_ was 593 microseconds. The current example exploits separate image pixel buffers for each color component (or SOA), which significantly improves performance.

Summary

Here are the key learning points for Chapter 14:
  • Assembly language functions can use AVX-512 promoted versions of most AVX and AVX2 packed integer instructions to perform operations using 512-, 256-, and 128-bit wide operands.

  • Assembly language functions can use the vmovdqa[32|64] and vmovdqu[8|16|32|64] instructions to perform aligned and unaligned moves of packed integer operands.

  • Assembly language functions can use the vpmovus[qd|qw|qb|dw|db|wb] instructions to carry out packed integer size reductions using unsigned saturation. AVX-512 also supports an analogous set of packed integer size-reducing instructions using signed saturation.

  • The vpcmpu[b|w|d|q] instructions perform packed unsigned integer compare operations and save the resultant compare mask to an opmask register.

  • The vpand[d|q], vpandn[d|q], vpor[d|q], and vpxor[d|q] instructions can be used with an opmask register to perform merge or zero masking using doubleword or quadword elements.

  • The vextracti[32x4|32x8|64x2|64x4] instructions can be used to extract packed doubleword or quadword values from a packed integer operand.

  • When performing SIMD calculations using packed integer or floating-point operands, a structure-of-arrays construct is often significantly faster than an array-of-structures construct.

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.129.23.30