© The Author(s), under exclusive license to APress Media, LLC, part of Springer Nature 2022
D. Kusswurm, Modern Parallel Programming with C++ and Assembly Language, https://doi.org/10.1007/978-1-4842-7918-2_13

13. AVX Assembly Language Programming: Part 1

Daniel Kusswurm1  
(1)
Geneva, IL, USA
 

The previous two chapters explored the basics of x86-64 assembly language programming. In these chapters, you learned how to perform simple integer arithmetic using x86-64 instructions. You also learned how to carry out scalar floating-point calculations using AVX instructions. Finally, you studied important x86-64 assembly language programming constructs and concepts including for-loop coding, memory addressing modes, use of condition codes, and function calling convention requirements.

In this chapter, you will discover how to code x86-64 assembly language functions that perform packed integer operations using AVX instructions and 128-bit wide operands. The first section covers basic packed integer arithmetic. The second section details a few image processing algorithms. The source code examples presented in this chapter are adaptations of examples that you saw in Chapter 2. This was done intentionally to highlight the programming similarities that exist between C++ SIMD intrinsic functions and AVX instructions.

Integer Arithmetic

In this section, you will learn how to perform elementary packed integer arithmetic using x86-64 assembly language and AVX instructions. The first example explains packed integer addition and subtraction using 128-bit wide SIMD operands. This is followed by an example that demonstrates packed integer multiplication. The final two examples illustrate packed integer bitwise logical and shift operations.

Addition and Subtraction

Listing 13-1 shows the source code for example Ch13_01. This example explicates packed integer addition and subtraction using AVX instructions and 128-bit wide operands.
//------------------------------------------------
//               Ch13_01.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch13_01_fasm.asm
// Both functions treat a and b as eight packed int16_t lanes and write two
// results: c1 receives the wraparound result and c2 the signed-saturated
// result. All four XmmVal objects are accessed with vmovdqa (aligned 128-bit
// move), so each pointer must refer to 16-byte aligned storage.

// c1 = a + b (wraparound), c2 = a + b (signed saturation), per int16_t lane.
extern "C" void AddI16_Aavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
// c1 = a - b (wraparound), c2 = a - b (signed saturation), per int16_t lane.
extern "C" void SubI16_Aavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
//------------------------------------------------
//               Ch13_01.cpp
//------------------------------------------------
#include <iostream>
#include <iomanip>
#include "Ch13_01.h"
static void AddI16(void);
static void SubI16(void);
int main()
{
    AddI16();
    SubI16();
    return 0;
}
static void AddI16(void)
{
    const char nl = ' ';
    XmmVal a, b, c1, c2;
    // Packed int16_t addition
    a.m_I16[0] = 10;          b.m_I16[0] = 100;
    a.m_I16[1] = 200;         b.m_I16[1] = -200;
    a.m_I16[2] = 30;          b.m_I16[2] = 32760;
    a.m_I16[3] = -32766;      b.m_I16[3] = -400;
    a.m_I16[4] = 50;          b.m_I16[4] = 500;
    a.m_I16[5] = 60;          b.m_I16[5] = -600;
    a.m_I16[6] = 32000;       b.m_I16[6] = 1200;
    a.m_I16[7] = -32000;      b.m_I16[7] = -950;
    AddI16_Aavx(&c1, &c2, &a, &b);
    std::cout << " Results for AddI16_Aavx - Wraparound Addition ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c1: " << c1.ToStringI16() << nl;
    std::cout << " Results for AddI16_Aavx - Saturated Addition ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c2: " << c2.ToStringI16() << nl;
}
static void SubI16(void)
{
    const char nl = ' ';
    XmmVal a, b, c1, c2;
    a.m_I16[0] = 10;          b.m_I16[0] = 100;
    a.m_I16[1] = 200;         b.m_I16[1] = -200;
    a.m_I16[2] = -30;         b.m_I16[2] = 32760;
    a.m_I16[3] = -32766;      b.m_I16[3] = 400;
    a.m_I16[4] = 50;          b.m_I16[4] = 500;
    a.m_I16[5] = 60;          b.m_I16[5] = -600;
    a.m_I16[6] = 32000;       b.m_I16[6] = 1200;
    a.m_I16[7] = -32000;      b.m_I16[7] = 950;
    SubI16_Aavx(&c1, &c2, &a, &b);
    std::cout << " Results for SubI16_Aavx - Wraparound Subtraction ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c1: " << c1.ToStringI16() << nl;
    std::cout << " Results for SubI16_Aavx - Saturated Subtraction ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c2: " << c2.ToStringI16() << nl;
}
;-------------------------------------------------
;               Ch13_01_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" void AddI16_Aavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
        .code
AddI16_Aavx proc
; Arguments as used below: rcx = c1, rdx = c2, r8 = a, r9 = b.
; Adds the eight 16-bit lanes of a and b twice: once with wraparound
; arithmetic (stored to *c1) and once with signed saturation (stored to *c2).
        vmovdqa xmm0,xmmword ptr [r8]       ;xmm0 = a (aligned 128-bit load)
        vmovdqa xmm1,xmmword ptr [r9]       ;xmm1 = b (aligned 128-bit load)
        vpaddw xmm2,xmm0,xmm1               ;xmm2 = a + b per word, wraparound
        vpaddsw xmm3,xmm0,xmm1              ;xmm3 = a + b per word, signed saturation
        vmovdqa xmmword ptr [rcx],xmm2      ;*c1 = wraparound sums
        vmovdqa xmmword ptr [rdx],xmm3      ;*c2 = saturated sums
        ret
AddI16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void SubI16_Aavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
SubI16_Aavx proc
; Arguments as used below: rcx = c1, rdx = c2, r8 = a, r9 = b.
; Subtracts the eight 16-bit lanes of b from a twice: once with wraparound
; arithmetic (stored to *c1) and once with signed saturation (stored to *c2).
        vmovdqa xmm0,xmmword ptr [r8]       ;xmm0 = a (aligned 128-bit load)
        vmovdqa xmm1,xmmword ptr [r9]       ;xmm1 = b (aligned 128-bit load)
        vpsubw xmm2,xmm0,xmm1               ;xmm2 = a - b per word, wraparound
        vpsubsw xmm3,xmm0,xmm1              ;xmm3 = a - b per word, signed saturation
        vmovdqa xmmword ptr [rcx],xmm2      ;*c1 = wraparound differences
        vmovdqa xmmword ptr [rdx],xmm3      ;*c2 = saturated differences
        ret
SubI16_Aavx endp
        end
Listing 13-1

Example Ch13_01

The first file in Listing 13-1, Ch13_01.h, contains the function declarations for this example. Note that functions AddI16_Aavx() and SubI16_Aavx() both require pointer arguments of type XmmVal. This is the same C++ SIMD data structure that was introduced in Chapter 2. The file Ch13_01.cpp contains code that performs test case initialization and streams results to std::cout.

The first function in file Ch13_01_fasm.asm, AddI16_Aavx(), illustrates packed integer addition using 16-bit wide elements. Function AddI16_Aavx() begins with a vmovdqa xmm0,xmmword ptr [r8] that loads argument value a into register XMM0. The text xmmword ptr is an assembler operator that conveys the size (128 bits) of the source operand pointed to by R8. The next instruction, vmovdqa xmm1,xmmword ptr [r9], loads argument value b into register XMM1. The ensuing instruction pair, vpaddw xmm2,xmm0,xmm1 (Add Packed Integers) and vpaddsw xmm3,xmm0,xmm1 (Add Packed Integers with Signed Saturation), performs packed integer addition of word elements using wraparound and saturated arithmetic, respectively. The final two AVX instructions of AddI16_Aavx(), vmovdqa xmmword ptr [rcx],xmm2 and vmovdqa xmmword ptr [rdx],xmm3, save the calculated results to the XmmVal buffers pointed to by c1 and c2.

Recall that source code example Ch02_01 included a C++ function named AddI16_Iavx(). This function employed _mm_load_si128() and _mm_store_si128() to perform SIMD load and store operations. In the current example, the assembly language function AddI16_Aavx() (which performs the same operations as AddI16_Iavx()) uses the vmovdqa instruction to perform SIMD loads and stores. Function AddI16_Iavx() also used _mm_add_epi16() and _mm_adds_epi16() to carry out packed integer addition using 16-bit wide integer elements. These C++ SIMD intrinsic functions are the counterparts of the AVX instructions vpaddw and vpaddsw . Most of the C++ SIMD intrinsic functions that you learned about in the first half of this book are essentially wrapper functions for x86-AVX instructions.

The second function in file Ch13_01_fasm.asm, named SubI16_Aavx(), performs packed subtraction using 16-bit wide integer elements. The code arrangement of this function is identical to AddI16_Aavx() except for the use of vpsubw (Subtract Packed Integers) and vpsubsw (Subtract Packed Integers with Signed Saturation) to carry out packed integer subtraction. X86-AVX also includes other size variants of the addition and subtraction instructions demonstrated in this example. These are listed in the summary table (Table 13-3) located at the end of this chapter. Here are the results for source code example Ch13_01:
Results for AddI16_Aavx - Wraparound Addition
a:        10     200      30  -32766   |      50      60   32000  -32000
b:       100    -200   32760    -400   |     500    -600    1200    -950
c1:      110       0  -32746   32370   |     550    -540  -32336   32586
Results for AddI16_Aavx - Saturated Addition
a:        10     200      30  -32766   |      50      60   32000  -32000
b:       100    -200   32760    -400   |     500    -600    1200    -950
c2:      110       0   32767  -32768   |     550    -540   32767  -32768
Results for SubI16_Aavx - Wraparound Subtraction
a:        10     200     -30  -32766   |      50      60   32000  -32000
b:       100    -200   32760     400   |     500    -600    1200     950
c1:      -90     400   32746   32370   |    -450     660   30800   32586
Results for SubI16_Aavx - Saturated Subtraction
a:        10     200     -30  -32766   |      50      60   32000  -32000
b:       100    -200   32760     400   |     500    -600    1200     950
c2:      -90     400  -32768  -32768   |    -450     660   30800  -32768

Multiplication

The next source code example, named Ch13_02, demonstrates how to perform packed integer multiplication. This example is the assembly language counterpart of source example Ch02_03. Listing 13-2 shows the assembly language code for example Ch13_02. The C++ test case initialization code for this example is not shown in Listing 13-2 since it is identical to the code used in example Ch02_03.
;-------------------------------------------------
;               Ch13_02_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" void MulI16_Aavx(XmmVal c[2], const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
        .code
MulI16_Aavx proc
; Arguments as used below: rcx = c (array of two XmmVals), rdx = a, r8 = b.
; Multiplies the eight signed 16-bit lanes of a and b and stores the eight
; full 32-bit products: products 0-3 in c[0], products 4-7 in c[1].
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpmullw xmm2,xmm0,xmm1              ;xmm2 = low 16 bits of each 32-bit product
        vpmulhw xmm3,xmm0,xmm1              ;xmm3 = high 16 bits of each 32-bit product
        vpunpcklwd xmm4,xmm2,xmm3           ;interleave lo/hi words 0-3 -> dword products 0-3
        vpunpckhwd xmm5,xmm2,xmm3           ;interleave lo/hi words 4-7 -> dword products 4-7
        vmovdqa xmmword ptr [rcx],xmm4      ;save c[0]
        vmovdqa xmmword ptr [rcx+16],xmm5   ;save c[1] (each XmmVal is 16 bytes)
        ret
MulI16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void MulI32a_Aavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
MulI32a_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8 = b.
; Multiplies the four 32-bit lanes of a and b and keeps only the low-order
; 32 bits of each 64-bit product.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpmulld xmm2,xmm0,xmm1              ;xmm2 = low 32 bits of each dword product
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = four truncated products
        ret
MulI32a_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void MulI32b_Aavx(XmmVal c[2], const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
MulI32b_Aavx proc
; Arguments as used below: rcx = c (array of two XmmVals), rdx = a, r8 = b.
; Multiplies the four signed 32-bit lanes of a and b and stores all four
; 64-bit products in element order at c+0, c+8, c+16, and c+24.
; vpmuldq only multiplies the even-numbered dwords of its sources, so the
; odd-numbered dwords are first shifted down into even positions and then
; multiplied in a second pass.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpmuldq xmm2,xmm0,xmm1              ;xmm2 = a[0]*b[0], a[2]*b[2] (qwords)
        vpsrldq xmm3,xmm0,4                 ;a[1], a[3] now in even dword slots
        vpsrldq xmm4,xmm1,4                 ;b[1], b[3] now in even dword slots
        vpmuldq xmm5,xmm3,xmm4              ;xmm5 = a[1]*b[1], a[3]*b[3] (qwords)
        vpextrq qword ptr [rcx],xmm2,0      ;save qword product 0 (a[0]*b[0])
        vpextrq qword ptr [rcx+8],xmm5,0    ;save qword product 1 (a[1]*b[1])
        vpextrq qword ptr [rcx+16],xmm2,1   ;save qword product 2 (a[2]*b[2])
        vpextrq qword ptr [rcx+24],xmm5,1   ;save qword product 3 (a[3]*b[3])
        ret
MulI32b_Aavx endp
        end
Listing 13-2

Example Ch13_02

The file Ch13_02_fasm.asm contains three functions that perform packed integer multiplication. The first function, MulI16_Aavx(), begins its execution with two vmovdqa instructions that load argument values a and b into registers XMM0 and XMM1, respectively. This is followed by a vpmullw xmm2,xmm0,xmm1 (Multiply Packed Signed Integers and Store Low Result) instruction that performs packed signed integer multiplication using the 16-bit wide elements of XMM0 and XMM1. The vpmullw instruction saves the low-order 16 bits of each 32-bit product in register XMM2. The vpmulhw xmm3,xmm0,xmm1 (Multiply Packed Signed Integers and Store High Result) that follows calculates and saves the high-order 16 bits of each 32-bit product in register XMM3. The ensuing instruction pair, vpunpcklwd xmm4,xmm2,xmm3 (Unpack Low Data) and vpunpckhwd xmm5,xmm2,xmm3 (Unpack High Data), interleaves the low- and high-order word elements of their respective source operands to form the final doubleword products as shown in Figure 13-1. The last two AVX instructions of MulI16_Aavx(), vmovdqa xmmword ptr [rcx],xmm4 and vmovdqa xmmword ptr [rcx+16],xmm5, save the calculated products to c[0] and c[1]. Note that the second vmovdqa instruction uses a displacement value of 16 since each XmmVal structure instance in array c is 16 bytes wide.
Figure 13-1

Packed 16-bit signed integer multiplication using vpmullw, vpmulhw, vpunpcklwd, and vpunpckhwd

The next function in Ch13_02_fasm.asm, MulI32a_Aavx(), performs packed signed integer multiplication using 32-bit wide elements. Note that this function only saves the low-order 32 bits of each 64-bit product. The final function in Listing 13-2, MulI32b_Aavx(), performs packed signed integer multiplication using 32-bit wide elements and saves complete 64-bit products. Function MulI32b_Aavx() begins its execution with two vmovdqa instructions that load argument values a and b into registers XMM0 and XMM1. The next instruction, vpmuldq xmm2,xmm0,xmm1 (Multiply Packed Doubleword Integers), performs packed 32-bit signed integer multiplication using the even-numbered elements of XMM0 and XMM1 and saves the resultant 64-bit products in register XMM2. The ensuing vpsrldq xmm3,xmm0,4 (Shift Double Quadword Right) and vpsrldq xmm4,xmm1,4 instructions right shift registers XMM0 (a) and XMM1 (b) by 4 bytes. This facilitates the use of the next instruction, vpmuldq xmm5,xmm3,xmm4, which calculates products using the odd-numbered elements of a and b as illustrated in Figure 13-2.
Figure 13-2

Packed 32-bit signed integer multiplication using vpmuldq and vpsrldq

Following calculation of the quadword products, MulI32b_Aavx() uses four vpextrq (Extract Quadword) instructions to save the results. Note that the immediate constant used with each vpextrq instruction selects which quadword element to extract from the first source operand. Also note that each vpextrq instruction specifies a destination operand in memory. This is different than most x86-AVX instructions, which require the destination operand to be an XMM, YMM, or ZMM register. Here are the results for source code example Ch13_02:
Results for MulI16_Aavx
a[0]:       10  b[0]:       -5  c[0][0]:          -50
a[1]:     3000  b[1]:      100  c[0][1]:       300000
a[2]:    -2000  b[2]:    -9000  c[0][2]:     18000000
a[3]:       42  b[3]:     1000  c[0][3]:        42000
a[4]:    -5000  b[4]:    25000  c[1][0]:   -125000000
a[5]:        8  b[5]:    16384  c[1][1]:       131072
a[6]:    10000  b[6]:     3500  c[1][2]:     35000000
a[7]:      -60  b[7]:     6000  c[1][3]:      -360000
---------------------------------------------------------------------------
Results for MulI32a_Aavx
a[0]:         10  b[0]:       -500  c[0]:      -5000
a[1]:       3000  b[1]:        100  c[1]:     300000
a[2]:      -2000  b[2]:     -12000  c[2]:   24000000
a[3]:       4200  b[3]:       1000  c[3]:    4200000
---------------------------------------------------------------------------
Results for MulI32b_Aavx
a[0]:         10  b[0]:       -500  c[0][0]:          -5000
a[1]:       3000  b[1]:        100  c[0][1]:         300000
a[2]:     -40000  b[2]:    -120000  c[1][0]:     4800000000
a[3]:       4200  b[3]:       1000  c[1][1]:        4200000

Bitwise Logical Operations

Listing 13-3 shows the assembly language code for source code example Ch13_03. This example spotlights the use of the AVX bitwise logical instructions vpand (Bitwise Logical AND), vpor (Bitwise Logical OR), and vpxor (Bitwise Logical Exclusive OR).
;-------------------------------------------------
;               Ch13_03_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" void AndU16_Aavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
        .code
AndU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8 = b.
; Stores the 128-bit bitwise AND of a and b to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpand xmm2,xmm0,xmm1                ;xmm2 = a & b (all 128 bits)
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = a & b
        ret
AndU16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void OrU16_Aavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
OrU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8 = b.
; Stores the 128-bit bitwise OR of a and b to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpor xmm2,xmm0,xmm1                 ;xmm2 = a | b (all 128 bits)
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = a | b
        ret
OrU16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void XorU16_Aavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
;--------------------------------------------------------------------------
XorU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8 = b.
; Stores the 128-bit bitwise exclusive OR of a and b to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovdqa xmm1,xmmword ptr [r8]       ;xmm1 = b
        vpxor xmm2,xmm0,xmm1                ;xmm2 = a ^ b (all 128 bits)
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = a ^ b
        ret
XorU16_Aavx endp
        end
Listing 13-3

Example Ch13_03

The functions shown in Listing 13-3 are the assembly language counterparts of the C++ SIMD calculating functions used in source code example Ch02_04. Function AndU16_Aavx() uses the vpand xmm2,xmm0,xmm1 instruction to perform a bitwise logical AND using the values in registers XMM0 and XMM1. Functions OrU16_Aavx() and XorU16_Aavx() are similar to AndU16_Aavx() but carry out bitwise logical OR and bitwise logical exclusive OR operations, respectively. Recall that when performing AVX (or AVX2) packed integer bitwise logical operations, the notion of distinct elements (e.g., byte, word, doubleword, or quadword) is not applicable. This explains why the instruction mnemonics vpand, vpor , and vpxor lack a size suffix letter. Here are the results for source code example Ch13_03:
Results for AndU16_Aavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     1200    00DC    AA00    1111   |    8000    7F7F    9070    0800
Results for OrU16_Aavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     FF34    ABFF    AAFF    5555   |    FFFF    7FFF    F8F6    7F80
Results for XorU16_Aavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     ED34    AB23    00FF    4444   |    7FFF    0080    6886    7780

Arithmetic and Logical Shifts

The final source code example of this section, Ch13_04, highlights the use of the AVX instructions vpsllw (Shift Packed Data Left Logical), vpsrlw (Shift Packed Data Right Logical), and vpsraw (Shift Packed Data Right Arithmetic). Listing 13-4 shows the assembly language source code for example Ch13_04.
;-------------------------------------------------
;               Ch13_04_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" void SllU16_Aavx(XmmVal* c, const XmmVal* a, int count);
;--------------------------------------------------------------------------
        .code
SllU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8d = count.
; Logically shifts each 16-bit lane of a left by count bits and stores the
; result to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovd xmm1,r8d                      ;xmm1[31:0] = count, upper bits zeroed
        vpsllw xmm2,xmm0,xmm1               ;logical left shift of each word by count
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = shifted words
        ret
SllU16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void SrlU16_Aavx(XmmVal* c, const XmmVal* a, int count);
;--------------------------------------------------------------------------
SrlU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8d = count.
; Logically shifts each 16-bit lane of a right by count bits (zero fill) and
; stores the result to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovd xmm1,r8d                      ;xmm1[31:0] = count, upper bits zeroed
        vpsrlw xmm2,xmm0,xmm1               ;logical right shift of each word by count
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = shifted words
        ret
SrlU16_Aavx endp
;--------------------------------------------------------------------------
; extern "C" void SraU16_Aavx(XmmVal* c, const XmmVal* a, int count);
;--------------------------------------------------------------------------
SraU16_Aavx proc
; Arguments as used below: rcx = c, rdx = a, r8d = count.
; Arithmetically shifts each 16-bit lane of a right by count bits (sign bit
; replicated) and stores the result to *c.
        vmovdqa xmm0,xmmword ptr [rdx]      ;xmm0 = a
        vmovd xmm1,r8d                      ;xmm1[31:0] = count, upper bits zeroed
        vpsraw xmm2,xmm0,xmm1               ;arithmetic right shift of each word by count
        vmovdqa xmmword ptr [rcx],xmm2      ;*c = shifted words
        ret
SraU16_Aavx endp
        end
Listing 13-4

Example Ch13_04

Function SllU16_Aavx() begins its execution with a vmovdqa xmm0,xmmword ptr [rdx] instruction that loads argument value a into register XMM0. The next instruction, vmovd xmm1,r8d (Move Doubleword), copies the doubleword value in register R8D (argument value count) to XMM1[31:0]. Execution of this instruction also zeros bits YMM1[255:32]; bits ZMM1[511:256] are likewise zeroed if the processor supports AVX-512. The ensuing vpsllw xmm2,xmm0,xmm1 instruction left shifts each word element in XMM0 using the shift count in XMM1[31:0].

Functions SrlU16_Aavx() and SraU16_Aavx() use a code arrangement that is similar to SllU16_Aavx(). Function SrlU16_Aavx() demonstrates the use of the vpsrlw instruction, while SraU16_Aavx() highlights the use of the vpsraw instruction. The AVX instructions vpsllw, vpsrlw, and vpsraw can also be used with an immediate operand that specifies the shift count. X86-AVX also includes instructions that perform shifts using doubleword and quadword elements. These are listed in the end-of-chapter summary table (Table 13-3). Here are the results for source code example Ch13_04:
Results for SllU16_Aavx - count = 8
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     3400    B000    CC00    8000   |    FF00    AA00    0F00    0100
Results for SrlU16_Aavx - count = 4
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     0123    0FFB    000C    0808   |    000F    0AAA    00F0    0010
Results for SraU16_Aavx - count = 4
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     0123    FFFB    000C    F808   |    000F    FAAA    00F0    0010

Image Processing Algorithms

In the first part of this book, several source code examples were presented that explained how to exploit C++ SIMD intrinsic functions to perform common image processing techniques. In this section, you will learn how to code a few image processing methods using x86-64 assembly language and AVX. The first source code example illustrates using AVX instructions to find the minimum and maximum values in a pixel buffer. The second source code example describes how to calculate a pixel buffer mean. Note that the AVX instructions and SIMD processing computations demonstrated in this section are also appropriate for use in other functions that carry out calculations using arrays or matrices of integer elements.

Pixel Minimum and Maximum

Listing 13-5 shows the x86-64 assembly language source code for example Ch13_05. This example, which is a modified version of example Ch02_06, finds the minimum and maximum value in a pixel buffer of 8-bit unsigned integers. The non-assembly language source code for example Ch13_05 is not shown in Listing 13-5 since it is almost identical to the code you saw in example Ch02_06. However, the complete code for example Ch13_05 is included in the download software package.
;-------------------------------------------------
;               Ch13_05_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" bool CalcMinMaxU8_Aavx(uint8_t* x_min, uint8_t* x_max,
;   const uint8_t* x, size_t n);
;--------------------------------------------------------------------------
NSE     equ 16                              ;num_simd_elements
        .code
CalcMinMaxU8_Aavx proc
; Arguments as used below: rcx = x_min, rdx = x_max, r8 = x, r9 = n.
; Scans n unsigned bytes at x in blocks of NSE (16) and stores the smallest
; and largest byte to *x_min and *x_max.
; Returns eax = 1 on success. Returns eax = 0, without writing through x_min
; or x_max, if n is zero, n is not a multiple of 16, or x is not 16-byte
; aligned.
; NOTE(review): x is only checked for alignment, so a null x passes
; validation; x_min and x_max are not checked at all.
; Make sure n and x are valid
        test r9,r9                          ;is n == 0?
        jz BadArg                           ;jump if yes
        test r9,0fh                         ;is n a multiple of 16?
        jnz BadArg                          ;jump if no
        test r8,0fh                         ;is x aligned to 16b boundary?
        jnz BadArg                          ;jump if no
; Initialize packed min and max values
        vpcmpeqb xmm4,xmm4,xmm4             ;packed minimums (all 0xff)
        vpxor xmm5,xmm5,xmm5                ;packed maximums (all 0x00)
        mov rax,-NSE                        ;i = -NSE so the first add yields 0
Loop1:  add rax,NSE                         ;i += NSE (byte offset into x)
        cmp rax,r9                          ;is i >= n?
        jae @F                              ;jump if yes
        vmovdqa xmm0,xmmword ptr [r8+rax]   ;xmm0 = block of 16 pixels
        vpminub xmm4,xmm4,xmm0              ;update packed min values
        vpmaxub xmm5,xmm5,xmm0              ;update packed max values
        jmp Loop1                           ;repeat until done
; Reduce packed min values
; Each step shifts the upper half of the remaining candidates down and takes
; the unsigned byte minimum, halving the candidate count: 16 -> 8 -> 4 -> 2 -> 1.
@@:     vpsrldq xmm0,xmm4,8                 ;upper 8 bytes -> xmm0[63:0]
        vpminub xmm0,xmm0,xmm4              ;xmm0[63:0] = final 8 min vals
        vpsrldq xmm1,xmm0,4                 ;bytes 4-7 -> xmm1[31:0]
        vpminub xmm1,xmm0,xmm1              ;xmm1[31:0] = final 4 min vals
        vpsrldq xmm2,xmm1,2                 ;bytes 2-3 -> xmm2[15:0]
        vpminub xmm2,xmm2,xmm1              ;xmm2[15:0] = final 2 min vals
        vpsrldq xmm3,xmm2,1                 ;byte 1 -> xmm3[7:0]
        vpminub xmm3,xmm3,xmm2              ;xmm3[7:0] = final min val
        vpextrb byte ptr [rcx],xmm3,0       ;save final min val
; Reduce packed max values
; Same halving scheme as above, using the unsigned byte maximum.
        vpsrldq xmm0,xmm5,8                 ;upper 8 bytes -> xmm0[63:0]
        vpmaxub xmm0,xmm0,xmm5              ;xmm0[63:0] = final 8 max vals
        vpsrldq xmm1,xmm0,4                 ;bytes 4-7 -> xmm1[31:0]
        vpmaxub xmm1,xmm0,xmm1              ;xmm1[31:0] = final 4 max vals
        vpsrldq xmm2,xmm1,2                 ;bytes 2-3 -> xmm2[15:0]
        vpmaxub xmm2,xmm2,xmm1              ;xmm2[15:0] = final 2 max vals
        vpsrldq xmm3,xmm2,1                 ;byte 1 -> xmm3[7:0]
        vpmaxub xmm3,xmm3,xmm2              ;xmm3[7:0] = final max val
        vpextrb byte ptr [rdx],xmm3,0       ;save final max val
        mov eax,1                           ;set success return code
        ret
BadArg: xor eax,eax                         ;set error return code
        ret
CalcMinMaxU8_Aavx endp
        end
Listing 13-5

Example Ch13_05

Near the top of Listing 13-5, function CalcMinMaxU8_Aavx() employs two test instructions to confirm that argument value n is not equal to zero and is an integral multiple of 16. The third test instruction verifies that pixel buffer x is aligned on a 16-byte boundary. Following argument validation, CalcMinMaxU8_Aavx() uses a vpcmpeqb xmm4,xmm4,xmm4 (Compare Packed Data for Equal) instruction to load 0xFF into each byte element of register XMM4. More specifically, vpcmpeqb performs byte element compares using its two source operands and sets the corresponding byte element in the destination operand to 0xFF if the source operand elements are equal. Function CalcMinMaxU8_Aavx() uses vpcmpeqb xmm4,xmm4,xmm4 to set each byte element of XMM4 to 0xFF since this is faster than using a vmovdqa instruction to load a 128-bit constant of all ones from memory. The ensuing vpxor xmm5,xmm5,xmm5 instruction sets each byte element in register XMM5 to 0x00.

The next instruction, mov rax,-NSE, initializes loop index variable i. Register RAX is loaded with -NSE since each iteration of Loop1 begins with an add rax,NSE instruction that calculates i += NSE. This is followed by the instruction pair cmp rax,r9 and jae @F, which terminates Loop1 when i >= n is true. Note that the order of instructions used to initialize and update i in Loop1 precludes a loop-carried dependency condition from occurring. A loop-carried dependency condition arises when calculations in a for-loop are dependent on values computed during a prior iteration. Having a loop-carried dependency in a for-loop sometimes results in slower performance. A for-loop sans any loop-carried dependencies provides better opportunities for the processor to perform calculations of successive iterations simultaneously.

During execution of Loop1, function CalcMinMaxU8_Aavx() maintains packed minimums and maximums in registers XMM4 and XMM5, respectively. The first AVX instruction of Loop1, vmovdqa xmm0,xmmword ptr [r8+rax], loads a block of 16 pixels (x[i:i+15]) into register XMM0. This is followed by a vpminub xmm4,xmm4,xmm0 (Minimum of Packed Unsigned Integers) instruction that updates the packed minimum pixel values in XMM4. The ensuing vpmaxub xmm5,xmm5,xmm0 (Maximum of Packed Unsigned Integers) instruction updates the packed maximum values in XMM5.

Following execution of Loop1, CalcMinMaxU8_Aavx() reduces the 16 minimum values in XMM4 to a single scalar value. The code block that performs this operation uses a series of vpsrldq and vpminub instructions as shown in Figure 13-3.
Figure 13-3

Reduction of packed 8-bit unsigned integers using vpsrldq and vpminub

The last instruction in the pixel minimum reduction code block, vpextrb byte ptr [rcx],xmm3,0 (Extract Byte), copies the pixel minimum value in XMM3[7:0] to the buffer pointed to by x_min. The ensuing code block in function CalcMinMaxU8_Aavx() uses the same reduction technique to compute the final pixel maximum value. Note that this code block employs vpmaxub instead of vpminub. Here are the results for source code example Ch13_05:
Results for CalcMinMaxU8_Cpp
rc0: 1  x_min0: 2  x_max0: 254
Results for CalcMinMaxU8_Aavx
rc1: 1  x_min1: 2  x_max1: 254
Running benchmark function CalcMinMaxU8_bm - please wait
Benchmark times save to file Ch13_05_CalcMinMaxU8_bm_OXYGEN4.csv
Table 13-1 shows the benchmark timing measurements for source code example Ch13_05. Like the example you saw in Chapter 2, the assembly language implementation of the pixel minimum-maximum algorithm clearly outperforms the C++ coded algorithm by a wide margin. The right-most column in Table 13-1 contains the benchmark timing measurements from Table 2-3 for the function CalcMinMaxU8_Iavx(), which used C++ SIMD intrinsic functions to calculate pixel minimums and maximums. As you can see, these numbers are similar to those for the assembly language function CalcMinMaxU8_Aavx().
Table 13-1

Pixel Minimum and Maximum Execution Times (Microseconds), 10,000,000 Pixels

CPU

CalcMinMaxU8_Cpp()

CalcMinMaxU8_Aavx()

CalcMinMaxU8_Iavx()

Intel Core i7-8700K

6760

388

406

Intel Core i5-11600K

7045

314

304

As a reminder, it is important to keep in mind that the benchmark timing measurements reported in this and subsequent chapters are intended to provide some helpful insights regarding potential performance gains of an x86-AVX assembly language coded function compared to one coded using standard C++ statements. It is also important to reiterate that this book is an introductory primer about x86 SIMD programming and not benchmarking. Many of the x86 SIMD calculating functions, both C++ and assembly language, are coded to hasten learning and yield significant but not necessarily optimal performance. Chapter 2 contains additional information about the benchmark timing measurements published in this book.

Pixel Mean Intensity

In source code example Ch02_07, you learned how to calculate the arithmetic mean of an array of 8-bit unsigned integers using C++ SIMD intrinsic functions. The final source example of this chapter, named Ch13_06, illustrates how to perform this same calculation using x86-64 assembly language and AVX instructions. Listing 13-6 shows the assembly language source code for example Ch13_06.
;-------------------------------------------------
;               Ch13_06_fasm.asm
;-------------------------------------------------
;--------------------------------------------------------------------------
; extern "C" bool CalcMeanU8_Aavx(double* mean_x, uint64_t* sum_x, const uint8_t* x,
;   size_t n);
;
; Sums the 8-bit unsigned integers in x and computes their arithmetic mean,
; processing 64 elements per loop iteration.
;
; Register usage, as read from the code below:
;   rcx = mean_x (receives sum / n as a double)
;   rdx = sum_x  (receives the 64-bit sum)
;   r8  = x      (must be aligned on a 16-byte boundary)
;   r9  = n      (must be non-zero, <= g_NumElementsMax, and a multiple of 64)
;
; Returns: eax = 1 on success; eax = 0 if n or x fails validation, in
;          which case nothing is written to mean_x or sum_x.
;--------------------------------------------------------------------------
NSE     equ 64                              ;num_simd_elements (pixels per loop iteration)
        .code
        extern g_NumElementsMax:qword
CalcMeanU8_Aavx proc
; Make sure n and x are valid
        test r9,r9                          ;is n == 0?
        jz BadArg                           ;jump if yes
        cmp r9,[g_NumElementsMax]           ;is n > g_NumElementsMax?
        ja BadArg                           ;jump if yes
        test r9,3fh                         ;is n even multiple of 64? (low 6 bits clear)
        jnz BadArg                          ;jump if no
        test r8,0fh                         ;is x aligned to 16b boundary? (vmovdqa below requires it)
        jnz BadArg                          ;jump if no
; Initialize
; NOTE(review): the g_NumElementsMax check above presumably keeps the four
; dword sums in xmm5 from overflowing - confirm against its definition.
        vpxor xmm4,xmm4,xmm4                ;packed zero (used to zero-extend when unpacking)
        vpxor xmm5,xmm5,xmm5                ;packed sums (4 dwords)
        mov rax,-NSE                        ;initialize i (-NSE so the first add yields 0)
; Calculate sum of all pixels
; Each iteration adds 64 pixels into 8 word lanes that are zeroed first, so
; every lane receives 8 byte values (at most 8 * 255 = 2040) and cannot
; overflow before being promoted to dwords.
Loop1:  add rax,NSE                         ;i += NSE
        cmp rax,r9                          ;is i >= n?
        jae @F                              ;jump if yes
        vpxor xmm3,xmm3,xmm3                ;loop packed sums (8 words)
        vmovdqa xmm0,xmmword ptr [r8+rax]   ;load block of 16 pixels x[i:i+15]
        vpunpcklbw xmm1,xmm0,xmm4           ;promote bytes to words (low 8 pixels)
        vpunpckhbw xmm2,xmm0,xmm4           ;promote bytes to words (high 8 pixels)
        vpaddw xmm3,xmm3,xmm1               ;update loop packed sums
        vpaddw xmm3,xmm3,xmm2
        vmovdqa xmm0,xmmword ptr [r8+rax+16]    ;load block of 16 pixels x[i+16:i+31]
        vpunpcklbw xmm1,xmm0,xmm4               ;promote bytes to words
        vpunpckhbw xmm2,xmm0,xmm4
        vpaddw xmm3,xmm3,xmm1                   ;update loop packed sums
        vpaddw xmm3,xmm3,xmm2
        vmovdqa xmm0,xmmword ptr [r8+rax+32]    ;load block of 16 pixels x[i+32:i+47]
        vpunpcklbw xmm1,xmm0,xmm4               ;promote bytes to words
        vpunpckhbw xmm2,xmm0,xmm4
        vpaddw xmm3,xmm3,xmm1                   ;update loop packed sums
        vpaddw xmm3,xmm3,xmm2
        vmovdqa xmm0,xmmword ptr [r8+rax+48]    ;load block of 16 pixels x[i+48:i+63]
        vpunpcklbw xmm1,xmm0,xmm4               ;promote bytes to words
        vpunpckhbw xmm2,xmm0,xmm4
        vpaddw xmm3,xmm3,xmm1                   ;update loop packed sums
        vpaddw xmm3,xmm3,xmm2
        vpunpcklwd xmm0,xmm3,xmm4               ;promote loop packed sums (words to dwords)
        vpunpckhwd xmm1,xmm3,xmm4
        vpaddd xmm5,xmm5,xmm0                   ;update packed dword sums
        vpaddd xmm5,xmm5,xmm1
        jmp Loop1                               ;repeat until done
; Reduce packed sums (4 dwords) to single qword
; Writing a 32-bit register zero-extends into the full 64-bit register, so the
; 64-bit adds below operate on zero-extended dword values.
@@:     vpextrd eax,xmm5,0                  ;rax = xmm5[31:0]
        vpextrd r10d,xmm5,1                 ;r10 = xmm5[63:32]
        add rax,r10                         ;update qword sum
        vpextrd r10d,xmm5,2                 ;r10 = xmm5[95:64]
        add rax,r10                         ;update qword sum
        vpextrd r10d,xmm5,3                 ;r10 = xmm5[127:96]
        add rax,r10                         ;update qword sum
        mov qword ptr [rdx],rax             ;save final qword sum
; Calculate mean
        vcvtsi2sd xmm0,xmm0,rax             ;convert sum to DPFP
        vcvtsi2sd xmm1,xmm1,r9              ;convert n to DPFP
        vdivsd xmm2,xmm0,xmm1               ;mean = sum / n
        vmovsd real8 ptr [rcx],xmm2         ;save mean
        mov eax,1                           ;set success return code
        ret
; Validation failed: return false without touching mean_x or sum_x
BadArg: xor eax,eax                         ;set error return code
        ret
CalcMeanU8_Aavx endp
        end
Listing 13-6

Example Ch13_06

Near the top of file Ch13_06_fasm.asm is the statement extern g_NumElementsMax:qword, which declares g_NumElementsMax as an external quadword variable (the definition of g_NumElementsMax is located in the file Ch13_06_Misc.cpp). The first code block of CalcMeanU8_Aavx() uses a test r9,r9 and jz BadArg to ensure that argument value n is not equal to zero. The next instruction pair, cmp r9,[g_NumElementsMax] and ja BadArg, bypasses the calculating code if n > g_NumElementsMax is true. This is followed by the instruction pair test r9,3fh and jnz BadArg, which confirms that n is an even multiple of 64 (in later examples, you will learn how to process residual pixels). The final check of the first code block, test r8,0fh and jnz BadArg, confirms that pixel buffer x is aligned on a 16-byte boundary.

Following argument validation, CalcMeanU8_Aavx() sums the elements of pixel buffer x using SIMD arithmetic. The technique used in for-loop Loop1 is identical to the one used in source code example Ch02_07 and begins with a vpxor xmm3,xmm3,xmm3 instruction that initializes eight word sums to zero. The vmovdqa xmm0,xmmword ptr [r8+rax] instruction that follows loads pixel values x[i:i+15] into register XMM0. The ensuing instruction pair, vpunpcklbw xmm1,xmm0,xmm4 and vpunpckhbw xmm2,xmm0,xmm4, size-promotes the pixel values to 16 bits. These values are then added to the intermediate word sums in register XMM3 using the instructions vpaddw xmm3,xmm3,xmm1 and vpaddw xmm3,xmm3,xmm2 as shown in Figure 13-4. The next three code blocks in Loop1 add pixel values x[i+16:i+31], x[i+32:i+47], and x[i+48:i+63] to the intermediate sums in register XMM3. The final code block in Loop1 employs the instruction pair vpunpcklwd xmm0,xmm3,xmm4 and vpunpckhwd xmm1,xmm3,xmm4 to size-promote the packed word sums in XMM3 to doublewords. It then exercises two vpaddd instructions to add the current iteration sums to the intermediate packed doubleword sums maintained in register XMM5.
Figure 13-4

Summing of pixel values using vpunpcklbw, vpunpckhbw, and vpaddw in Loop1 of CalcMeanU8_Aavx()

Following execution of Loop1, CalcMeanU8_Aavx() uses a series of vpextrd and add instructions to reduce the packed doubleword sums to a single quadword value. Function CalcMeanU8_Aavx() then employs the instruction vcvtsi2sd xmm0,xmm0,rax to convert sum to a double-precision floating-point value. The next instruction, vcvtsi2sd xmm1,xmm1,r9, converts n to double-precision floating-point. This is followed by a vdivsd xmm2,xmm0,xmm1 instruction that calculates the mean. The ensuing vmovsd real8 ptr [rcx],xmm2 instruction saves the calculated mean. Here are the results for source code example Ch13_06:
Results for CalcMeanU8_Cpp
rc0: 1  sum_x0: 1275046509  mean_x0: 127.504651
Results for CalcMeanU8_Aavx
rc1: 1  sum_x1: 1275046509  mean_x1: 127.504651
Running benchmark function CalcMeanU8_bm - please wait
Benchmark times save to file Ch13_06_CalcMeanU8_bm_OXYGEN4.csv
Table 13-2 shows some benchmark timing measurements for source code example Ch13_06. This table also shows the timing measurements from Table 2-4 for CalcMeanU8_Iavx(), which used C++ SIMD intrinsic functions. Like the previous example, the benchmark timing measurements for the assembly language and C++ SIMD intrinsic function implementations are similar.
Table 13-2

Pixel Array Arithmetic Mean Execution Times (Microseconds), 10,000,000 Pixels

CPU

CalcMeanU8_Cpp()

CalcMeanU8_Aavx()

CalcMeanU8_Iavx()

Intel Core i7-8700K

2289

461

462

Intel Core i5-11600K

1856

301

288

Summary

Table 13-3 summarizes the x86 assembly language instructions introduced in this chapter. This table also includes closely related instructions. Before proceeding to the next chapter, make sure you understand the operation that is performed by each instruction shown in Table 13-3.
Table 13-3

X86 Assembly Language Instruction Summary for Chapter 13

Instruction Mnemonic

Description

vmov[d|q]

Move doubleword or quadword into XMM register

vpadd[b|w|d|q]

Packed integer addition

vpadds[b|w]

Packed signed integer addition (saturated)

vpaddus[b|w]

Packed unsigned integer addition (saturated)

vpand

Bitwise logical AND

vpcmpeq[b|w|d|q]

Packed integer compare for equality

vpextr[b|w|d|q]

Extract integer

vpmaxs[b|w|d|q]

Packed signed integer maximum

vpmaxu[b|w|d|q]

Packed unsigned integer maximum

vpmins[b|w|d|q]

Packed signed integer minimum

vpminu[b|w|d|q]

Packed unsigned integer minimum

vpmuldq

Packed signed integer multiplication (quadword results)

vpmulhw

Packed signed integer multiplication (high results)

vpmull[w|d|q]

Packed signed integer multiplication (low results)

vpor

Bitwise logical OR

vpsll[w|d|q]

Packed integer shift left logical

vpsra[w|d|q]

Packed integer shift right arithmetic

vpsrl[w|d|q]

Packed integer shift right logical

vpslldq

Shift double quadword left logical

vpsrldq

Shift double quadword right logical

vpsub[b|w|d|q]

Packed integer subtraction

vpsubs[b|w]

Packed signed integer subtraction (saturated)

vpsubus[b|w]

Packed unsigned integer subtraction (saturated)

vpunpckh[bw|wd|dq|qdq]

Unpack and interleave high-order integers

vpunpckl[bw|wd|dq|qdq]

Unpack and interleave low-order integers

vpxor

Bitwise logical exclusive OR

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.139.79.59