
6. AVX Programming – Packed Floating-Point


The source code examples of the previous chapter elucidated the fundamentals of AVX programming using scalar floating-point arithmetic. In this chapter, you’ll learn how to use the AVX instruction set to perform operations using packed floating-point operands. The chapter begins with three source code examples that demonstrate common packed floating-point operations, including basic arithmetic, data comparisons, and data conversions. The next set of source code examples illustrates how to carry out SIMD computations using floating-point arrays. The final two source code examples explain how to use the AVX instruction set to accelerate matrix transposition and multiplication.

In Chapter 4 you learned that AVX supports packed floating-point operations using either 128-bit or 256-bit wide operands. All of the source code examples in this chapter use 128-bit wide packed floating-point operands, both single-precision and double-precision, and the XMM register set. You’ll learn how to use 256-bit wide packed floating-point operands and the YMM register set in Chapter 9.

Packed Floating-Point Arithmetic

Listing 6-1 shows the source code for example Ch06_01, which demonstrates how to perform common arithmetic operations using packed single-precision and double-precision floating-point operands. It also highlights proper alignment techniques for packed floating-point operands in memory.
//------------------------------------------------
//        XmmVal.h
//------------------------------------------------
#pragma once
#include <string>
#include <cstdint>
#include <sstream>
#include <iomanip>
struct XmmVal
{
public:
  union
  {
    int8_t m_I8[16];
    int16_t m_I16[8];
    int32_t m_I32[4];
    int64_t m_I64[2];
    uint8_t m_U8[16];
    uint16_t m_U16[8];
    uint32_t m_U32[4];
    uint64_t m_U64[2];
    float m_F32[4];
    double m_F64[2];
  };

  std::string ToStringF32(void);
  std::string ToStringF64(void);
  std::string ToStringI32(void);
  std::string ToStringX32(void);
  std::string ToStringX64(void);

  // The definitions of these (and the other) ToString member functions are not
  // shown here; they are included with the chapter download package.
};
//------------------------------------------------
//        Ch06_01.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include "XmmVal.h"
using namespace std;
extern "C" void AvxPackedMathF32_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
extern "C" void AvxPackedMathF64_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
void AvxPackedMathF32(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[8];
  a.m_F32[0] = 36.0f;         b.m_F32[0] = -(float)(1.0 / 9.0);
  a.m_F32[1] = (float)(1.0 / 32.0);  b.m_F32[1] = 64.0f;
  a.m_F32[2] = 2.0f;         b.m_F32[2] = -0.0625f;
  a.m_F32[3] = 42.0f;         b.m_F32[3] = 8.666667f;
  AvxPackedMathF32_(a, b, c);
  cout << ("\nResults for AvxPackedMathF32\n");
  cout << "a:       " << a.ToStringF32() << '\n';
  cout << "b:       " << b.ToStringF32() << '\n';
  cout << '\n';
  cout << "addps:   " << c[0].ToStringF32() << '\n';
  cout << "subps:   " << c[1].ToStringF32() << '\n';
  cout << "mulps:   " << c[2].ToStringF32() << '\n';
  cout << "divps:   " << c[3].ToStringF32() << '\n';
  cout << "absps b: " << c[4].ToStringF32() << '\n';
  cout << "sqrtps a:" << c[5].ToStringF32() << '\n';
  cout << "minps:   " << c[6].ToStringF32() << '\n';
  cout << "maxps:   " << c[7].ToStringF32() << '\n';
}
void AvxPackedMathF64(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[8];
  a.m_F64[0] = 2.0;    b.m_F64[0] = M_E;
  a.m_F64[1] = M_PI;   b.m_F64[1] = -M_1_PI;
  AvxPackedMathF64_(a, b, c);
  cout << ("\nResults for AvxPackedMathF64\n");
  cout << "a:       " << a.ToStringF64() << '\n';
  cout << "b:       " << b.ToStringF64() << '\n';
  cout << '\n';
  cout << "addpd:   " << c[0].ToStringF64() << '\n';
  cout << "subpd:   " << c[1].ToStringF64() << '\n';
  cout << "mulpd:   " << c[2].ToStringF64() << '\n';
  cout << "divpd:   " << c[3].ToStringF64() << '\n';
  cout << "abspd b: " << c[4].ToStringF64() << '\n';
  cout << "sqrtpd a:" << c[5].ToStringF64() << '\n';
  cout << "minpd:   " << c[6].ToStringF64() << '\n';
  cout << "maxpd:   " << c[7].ToStringF64() << '\n';
}
int main()
{
  AvxPackedMathF32();
  AvxPackedMathF64();
  return 0;
}
;-------------------------------------------------
;        Ch06_01.asm
;-------------------------------------------------
      .const
      align 16
AbsMaskF32 dword 7fffffffh, 7fffffffh, 7fffffffh, 7fffffffh ;Absolute value mask for SPFP
AbsMaskF64 qword 7fffffffffffffffh, 7fffffffffffffffh    ;Absolute value mask for DPFP
; extern "C" void AvxPackedMathF32_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
      .code
AvxPackedMathF32_ proc
; Load packed SPFP values
    vmovaps xmm0,xmmword ptr [rcx]    ;xmm0 = a
    vmovaps xmm1,xmmword ptr [rdx]    ;xmm1 = b
; Packed SPFP addition
    vaddps xmm2,xmm0,xmm1
    vmovaps [r8+0],xmm2
; Packed SPFP subtraction
    vsubps xmm2,xmm0,xmm1
    vmovaps [r8+16],xmm2
; Packed SPFP multiplication
    vmulps xmm2,xmm0,xmm1
    vmovaps [r8+32],xmm2
; Packed SPFP division
    vdivps xmm2,xmm0,xmm1
    vmovaps [r8+48],xmm2
; Packed SPFP absolute value (b)
    vandps xmm2,xmm1,xmmword ptr [AbsMaskF32]
    vmovaps [r8+64],xmm2
; Packed SPFP square root (a)
    vsqrtps xmm2,xmm0
    vmovaps [r8+80],xmm2
; Packed SPFP minimum
    vminps xmm2,xmm0,xmm1
    vmovaps [r8+96],xmm2
; Packed SPFP maximum
    vmaxps xmm2,xmm0,xmm1
    vmovaps [r8+112],xmm2
    ret
AvxPackedMathF32_ endp
; extern "C" void AvxPackedMathF64_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
AvxPackedMathF64_ proc
; Load packed DPFP values
    vmovapd xmm0,xmmword ptr [rcx]    ;xmm0 = a
    vmovapd xmm1,xmmword ptr [rdx]    ;xmm1 = b
; Packed DPFP addition
    vaddpd xmm2,xmm0,xmm1
    vmovapd [r8+0],xmm2
; Packed DPFP subtraction
    vsubpd xmm2,xmm0,xmm1
    vmovapd [r8+16],xmm2
; Packed DPFP multiplication
    vmulpd xmm2,xmm0,xmm1
    vmovapd [r8+32],xmm2
; Packed DPFP division
    vdivpd xmm2,xmm0,xmm1
    vmovapd [r8+48],xmm2
; Packed DPFP absolute value (b)
    vandpd xmm2,xmm1,xmmword ptr [AbsMaskF64]
    vmovapd [r8+64],xmm2
; Packed DPFP square root (a)
    vsqrtpd xmm2,xmm0
    vmovapd [r8+80],xmm2
; Packed DPFP minimum
    vminpd xmm2,xmm0,xmm1
    vmovapd [r8+96],xmm2
; Packed DPFP maximum
    vmaxpd xmm2,xmm0,xmm1
    vmovapd [r8+112],xmm2
    ret
AvxPackedMathF64_ endp
    end
Listing 6-1. Example Ch06_01

Listing 6-1 begins with the header file XmmVal.h, which declares a C++ structure named XmmVal. This structure contains a publicly accessible anonymous union that facilitates packed operand data exchange between functions written in C++ and x86 assembly language. The members of this union correspond to the packed data types that can be used with an XMM register. The structure XmmVal also includes several member functions that format and display the contents of an XmmVal variable (the source code for these member functions is not shown here but is included with the chapter download package).

Near the top of the C++ code are the declarations for the x86-64 assembly language functions AvxPackedMathF32_ and AvxPackedMathF64_. These functions carry out ordinary packed arithmetic operations using the supplied XmmVal argument values. Note that for both AvxPackedMathF32_ and AvxPackedMathF64_, arguments a and b are passed by reference instead of by value in order to avoid the overhead of an XmmVal copy operation. Using pointers to pass a and b would also work in this example since pointers and references are indistinguishable from the perspective of the x86-64 assembly language functions.
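For instance, a pointer-based declaration such as the following (shown here for illustration only; it isn’t used in the example) would be handled identically by the assembly language code:
extern "C" void AvxPackedMathF32_(const XmmVal* a, const XmmVal* b, XmmVal c[8]);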

Immediately following the assembly language function declarations is the definition of function AvxPackedMathF32. This function contains code that demonstrates packed single-precision floating-point arithmetic. Note that the XmmVal variables a, b, and c are all defined using the specifier alignas(16), which instructs the C++ compiler to align each variable on a 16-byte boundary. The next set of statements initializes the arrays a.m_F32 and b.m_F32 with test values. The C++ code then calls the assembly language function AvxPackedMathF32_ to perform various arithmetic operations using the packed single-precision floating-point operands. The results are then displayed using a series of stream writes to cout. The C++ code also contains a function named AvxPackedMathF64 that illustrates arithmetic operations using packed double-precision floating-point operands. The organization of this function is similar to AvxPackedMathF32.

The x86-64 assembly language code for example Ch06_01 begins with a .const section that defines packed mask values for calculating floating-point absolute values. The align 16 statement is a MASM directive that instructs the assembler to align the next variable (or instruction) to a 16-byte boundary. Using this statement guarantees that the mask AbsMaskF32 is properly aligned. Note that unlike x86-SSE, x86-AVX instruction operands in memory need not be properly aligned except for instructions that explicitly specify aligned operands (e.g., vmovaps). However, proper alignment of packed operands in memory is strongly recommended whenever possible in order to avoid the performance penalties that can occur when the processor accesses an unaligned operand. A second align 16 directive is not necessary to ensure alignment of AbsMaskF64 since the size of AbsMaskF32 is 16 bytes, but it would be okay to include such a statement.

The first instruction of AvxPackedMathF32_, vmovaps xmm0,xmmword ptr [rcx], loads argument a (i.e., the four floating-point values stored in XmmVal a) into register XMM0. As mentioned in the previous paragraph, the vmovaps (Move Aligned Packed Single-Precision Floating-Point Values) instruction requires source operands in memory to be properly aligned. This is why the alignas(16) specifiers were used in the C++ code. The operator xmmword ptr directs the assembler to treat the memory location pointed to by RCX as a 128-bit operand. In this instance, use of the xmmword ptr operator is optional and employed to improve code readability. The ensuing vmovaps xmm1,xmmword ptr [rdx] instruction loads b into register XMM1. The vaddps xmm2,xmm0,xmm1 (Add Packed Single-Precision Floating-Point Values) instruction performs packed single-precision floating-point addition using the contents of registers XMM0 and XMM1. It then saves the calculated sum to register XMM2, as shown in Figure 6-1. Note that the vaddps instruction does not modify the contents of its two source operands. The vmovaps [r8+0],xmm2 instruction that follows saves the result of the packed addition to c[0].
Figure 6-1. Execution of vaddps instruction

The ensuing vsubps, vmulps, and vdivps instructions carry out packed single-precision floating-point subtraction, multiplication, and division. This is followed by a vandps xmm2,xmm1,xmmword ptr [AbsMaskF32] instruction that calculates packed absolute values using argument b. The vandps (Bitwise AND of Packed Single-Precision Floating-Point Values) instruction performs a bitwise AND using its two source operands. Note that all of the bits in each AbsMaskF32 doubleword are set to one except the most significant bit, which corresponds to the sign bit of a single-precision floating-point value. A sign bit value of zero corresponds to a positive floating-point number as discussed in Chapter 4. Performing a bitwise AND using this 128-bit wide mask and the packed single-precision floating-point operand b sets the sign bit of each element to zero and generates packed absolute values.
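If you want to experiment with the same masking technique in C++, the following short function is a minimal sketch that uses AVX intrinsics (it is not part of example Ch06_01, and the function name is arbitrary):
#include <immintrin.h>

// Packed single-precision absolute value via sign-bit masking. The constant
// 0x7fffffff sets every bit except the sign bit, just like AbsMaskF32.
__m128 PackedAbsF32(__m128 b)
{
    __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(b, abs_mask);         // same operation as vandps
}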

The remaining instructions in AvxPackedMathF32_ calculate packed single-precision floating-point square roots (vsqrtps), minimums (vminps), and maximums (vmaxps). The organization of function AvxPackedMathF64_ is similar to AvxPackedMathF32_. AvxPackedMathF64_ carries out its calculations using the packed double-precision floating-point versions of the same instructions that are used in AvxPackedMathF32_. Here is the output for source code example Ch06_01:
Results for AvxPackedMathF32
a:       36.000000    0.031250  |    2.000000    42.000000
b:       -0.111111    64.000000  |    -0.062500    8.666667
addps:     35.888889    64.031250  |    1.937500    50.666668
subps:     36.111111   -63.968750  |    2.062500    33.333332
mulps:     -4.000000    2.000000  |    -0.125000   364.000000
divps:    -324.000000    0.000488  |   -32.000000    4.846154
absps b:     0.111111    64.000000  |    0.062500    8.666667
sqrtps a:    6.000000    0.176777  |    1.414214    6.480741
minps:     -0.111111    0.031250  |    -0.062500    8.666667
maxps:     36.000000    64.000000  |    2.000000    42.000000
Results for AvxPackedMathF64
a:             2.000000000000  |         3.141592653590
b:             2.718281828459  |         -0.318309886184
addpd:           4.718281828459  |         2.823282767406
subpd:          -0.718281828459  |         3.459902539774
mulpd:           5.436563656918  |         -1.000000000000
divpd:           0.735758882343  |         -9.869604401089
abspd b:          2.718281828459  |         0.318309886184
sqrtpd a:         1.414213562373  |         1.772453850906
minpd:           2.000000000000  |         -0.318309886184
maxpd:           2.718281828459  |         3.141592653590

Packed Floating-Point Compares

In Chapter 5, you learned how to compare scalar single-precision and double-precision floating-point values using the vcmps[d|s] instructions. In this section, you’ll learn how to compare packed single-precision and double-precision floating-point values using the vcmpp[d|s] instructions. Similar to their scalar counterparts, the packed compare instructions require four operands: a destination operand, two source operands, and an immediate compare predicate. The packed compare instructions signify their results using quadword (vcmppd) or doubleword (vcmpps) masks of all zeros (false compare result) or all ones (true compare result). Listing 6-2 shows the source code for example Ch06_02.
//------------------------------------------------
//        Ch06_02.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#define _USE_MATH_DEFINES
#include <math.h>
#include <limits>
#include "XmmVal.h"
using namespace std;
extern "C" void AvxPackedCompareF32_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
extern "C" void AvxPackedCompareF64_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
const char* c_CmpStr[8] =
{
  "EQ", "NE", "LT", "LE", "GT", "GE", "ORDERED", "UNORDERED"
};
void AvxPackedCompareF32(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[8];
  a.m_F32[0] = 2.0;     b.m_F32[0] = 1.0;
  a.m_F32[1] = 7.0;     b.m_F32[1] = 12.0;
  a.m_F32[2] = -6.0;    b.m_F32[2] = -6.0;
  a.m_F32[3] = 3.0;     b.m_F32[3] = 8.0;
  for (int i = 0; i < 2; i++)
  {
    if (i == 1)
      a.m_F32[0] = numeric_limits<float>::quiet_NaN();
    AvxPackedCompareF32_(a, b, c);
    cout << " Results for AvxPackedCompareF32 (iteration = " << i << ") ";
    cout << setw(11) << 'a' << ':' << a.ToStringF32() << ' ';
    cout << setw(11) << 'b' << ':' << b.ToStringF32() << ' ';
    cout << ' ';
    for (int j = 0; j < 8; j++)
      cout << setw(11) << c_CmpStr[j] << ':' << c[j].ToStringX32() << ' ';
  }
}
void AvxPackedCompareF64(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[8];
  a.m_F64[0] = 2.0;    b.m_F64[0] = M_E;
  a.m_F64[1] = M_PI;   b.m_F64[1] = -M_1_PI;
  for (int i = 0; i < 2; i++)
  {
    if (i == 1)
    {
      a.m_F64[0] = numeric_limits<double>::quiet_NaN();
      b.m_F64[1] = a.m_F64[1];
    }
    AvxPackedCompareF64_(a, b, c);
    cout << " Results for AvxPackedCompareF64 (iteration = " << i << ") ";
    cout << setw(11) << 'a' << ':' << a.ToStringF64() << ' ';
    cout << setw(11) << 'b' << ':' << b.ToStringF64() << ' ';
    cout << ' ';
    for (int j = 0; j < 8; j++)
      cout << setw(11) << c_CmpStr[j] << ':' << c[j].ToStringX64() << ' ';
  }
}
int main()
{
  AvxPackedCompareF32();
  AvxPackedCompareF64();
  return 0;
}
;-------------------------------------------------
;        Ch06_02.asm
;-------------------------------------------------
    include <cmpequ.asmh>
; extern "C" void AvxPackedCompareF32_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
    .code
AvxPackedCompareF32_ proc
    vmovaps xmm0,[rcx]          ;xmm0 = a
    vmovaps xmm1,[rdx]          ;xmm1 = b
; Perform packed EQUAL compare
    vcmpps xmm2,xmm0,xmm1,CMP_EQ
    vmovdqa xmmword ptr [r8],xmm2
; Perform packed NOT EQUAL compare
    vcmpps xmm2,xmm0,xmm1,CMP_NEQ
    vmovdqa xmmword ptr [r8+16],xmm2
; Perform packed LESS THAN compare
    vcmpps xmm2,xmm0,xmm1,CMP_LT
    vmovdqa xmmword ptr [r8+32],xmm2
; Perform packed LESS THAN OR EQUAL compare
    vcmpps xmm2,xmm0,xmm1,CMP_LE
    vmovdqa xmmword ptr [r8+48],xmm2
 ; Perform packed GREATER THAN compare
    vcmpps xmm2,xmm0,xmm1,CMP_GT
    vmovdqa xmmword ptr [r8+64],xmm2
; Perform packed GREATER THAN OR EQUAL compare
    vcmpps xmm2,xmm0,xmm1,CMP_GE
    vmovdqa xmmword ptr [r8+80],xmm2
; Perform packed ORDERED compare
    vcmpps xmm2,xmm0,xmm1,CMP_ORD
    vmovdqa xmmword ptr [r8+96],xmm2
; Perform packed UNORDERED compare
    vcmpps xmm2,xmm0,xmm1,CMP_UNORD
    vmovdqa xmmword ptr [r8+112],xmm2
    ret
AvxPackedCompareF32_ endp
; extern "C" void AvxPackedCompareF64_(const XmmVal& a, const XmmVal& b, XmmVal c[8]);
AvxPackedCompareF64_ proc
    vmovapd xmm0,[rcx]          ;xmm0 = a
    vmovapd xmm1,[rdx]          ;xmm1 = b
; Perform packed EQUAL compare
    vcmppd xmm2,xmm0,xmm1,CMP_EQ
    vmovdqa xmmword ptr [r8],xmm2
; Perform packed NOT EQUAL compare
    vcmppd xmm2,xmm0,xmm1,CMP_NEQ
    vmovdqa xmmword ptr [r8+16],xmm2
; Perform packed LESS THAN compare
    vcmppd xmm2,xmm0,xmm1,CMP_LT
    vmovdqa xmmword ptr [r8+32],xmm2
; Perform packed LESS THAN OR EQUAL compare
    vcmppd xmm2,xmm0,xmm1,CMP_LE
    vmovdqa xmmword ptr [r8+48],xmm2
 ; Perform packed GREATER THAN compare
    vcmppd xmm2,xmm0,xmm1,CMP_GT
    vmovdqa xmmword ptr [r8+64],xmm2
; Perform packed GREATER THAN OR EQUAL compare
    vcmppd xmm2,xmm0,xmm1,CMP_GE
    vmovdqa xmmword ptr [r8+80],xmm2
; Perform packed ORDERED compare
    vcmppd xmm2,xmm0,xmm1,CMP_ORD
    vmovdqa xmmword ptr [r8+96],xmm2
; Perform packed UNORDERED compare
    vcmppd xmm2,xmm0,xmm1,CMP_UNORD
    vmovdqa xmmword ptr [r8+112],xmm2
    ret
AvxPackedCompareF64_ endp
    end
Listing 6-2. Example Ch06_02

Figure 6-2 illustrates execution of the vcmpps xmm2,xmm0,xmm1,0 and vcmppd xmm2,xmm0,xmm1,1 instructions. In these examples, the compare predicate operands 0 and 1 test for equality and less than, respectively.
Figure 6-2. Execution of the vcmpps and vcmppd instructions

The C++ function AvxPackedCompareF32 begins by initializing a couple of XmmVal test variables. Similar to the example that you saw in the previous section, the alignas(16) specifier is used with each XmmVal variable to force proper alignment to a 16-byte boundary. The remaining code in this function invokes the assembly language function AvxPackedCompareF32_ and displays the results. Note that on the second iteration of the for loop, the constant numeric_limits<float>::quiet_NaN() is substituted for one of the values in XmmVal a to exemplify operation of the ordered and unordered compare predicates. An ordered compare is true when both operands are valid values. An unordered compare is true when one or both of the operands is a NaN or erroneously encoded. Substituting numeric_limits<float>::quiet_NaN() for one of the values in XmmVal a generates a true result for an unordered compare. The C++ code also includes the function AvxPackedCompareF64, which is the double-precision counterpart of AvxPackedCompareF32.
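To observe the ordered and unordered compare semantics without writing any assembly code, you can use the AVX compare intrinsic _mm_cmp_ps, which maps to vcmpps. The following small program is a hypothetical sketch (it is not part of example Ch06_02); _mm_movemask_ps collects the sign bit of each element mask into the low four bits of an integer:
#include <cstdio>
#include <limits>
#include <immintrin.h>

int main()
{
    float qnan = std::numeric_limits<float>::quiet_NaN();

    // _mm_set_ps lists elements from high to low, so element 0 of 'a' is a NaN
    __m128 a = _mm_set_ps(3.0f, -6.0f, 7.0f, qnan);
    __m128 b = _mm_set_ps(8.0f, -6.0f, 12.0f, 1.0f);

    __m128 ord = _mm_cmp_ps(a, b, _CMP_ORD_Q);      // true when neither element is a NaN
    __m128 unord = _mm_cmp_ps(a, b, _CMP_UNORD_Q);  // true when either element is a NaN

    printf("ordered mask:   0x%x\n", _mm_movemask_ps(ord));    // prints 0xe
    printf("unordered mask: 0x%x\n", _mm_movemask_ps(unord));  // prints 0x1
    return 0;
}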

The x86-64 assembly language code begins with an include <cmpequ.asmh> statement. This file, which was also used in example Ch05_05, contains compare predicate equates that are used with the vcmpp[d|s] instructions in this source code example. The assembly language function AvxPackedCompareF32_ starts with two vmovaps instructions that load arguments a and b into registers XMM0 and XMM1, respectively. The ensuing vcmpps xmm2,xmm0,xmm1,CMP_EQ instruction compares packed operands a and b for equality and saves the packed result (four doubleword mask values) to register XMM2. The contents of register XMM2 are then saved to the result array c using a vmovdqa xmmword ptr [r8],xmm2 instruction. The remaining code in AvxPackedCompareF32_ performs additional compare operations using the other compare predicates. The assembly language function AvxPackedCompareF64_ demonstrates how to use the vcmppd instruction to carry out packed double-precision floating-point compares. Here is the output for example Ch06_02:
Results for AvxPackedCompareF32 (iteration = 0)
     a:    2.000000    7.000000  |    -6.000000    3.000000
     b:    1.000000    12.000000  |    -6.000000    8.000000
     EQ:    00000000    00000000  |    FFFFFFFF    00000000
     NE:    FFFFFFFF    FFFFFFFF  |    00000000    FFFFFFFF
     LT:    00000000    FFFFFFFF  |    00000000    FFFFFFFF
     LE:    00000000    FFFFFFFF  |    FFFFFFFF    FFFFFFFF
     GT:    FFFFFFFF    00000000  |    00000000    00000000
     GE:    FFFFFFFF    00000000  |    FFFFFFFF    00000000
  ORDERED:    FFFFFFFF    FFFFFFFF  |    FFFFFFFF    FFFFFFFF
 UNORDERED:    00000000    00000000  |    00000000    00000000
Results for AvxPackedCompareF32 (iteration = 1)
     a:       nan    7.000000  |    -6.000000    3.000000
     b:    1.000000    12.000000  |    -6.000000    8.000000
     EQ:    00000000    00000000  |    FFFFFFFF    00000000
     NE:    FFFFFFFF    FFFFFFFF  |    00000000    FFFFFFFF
     LT:    00000000    FFFFFFFF  |    00000000    FFFFFFFF
     LE:    00000000    FFFFFFFF  |    FFFFFFFF    FFFFFFFF
     GT:    00000000    00000000  |    00000000    00000000
     GE:    00000000    00000000  |    FFFFFFFF    00000000
  ORDERED:    00000000    FFFFFFFF  |    FFFFFFFF    FFFFFFFF
 UNORDERED:    FFFFFFFF    00000000  |    00000000    00000000
Results for AvxPackedCompareF64 (iteration = 0)
     a:         2.000000000000  |         3.141592653590
     b:         2.718281828459  |         -0.318309886184
     EQ:        0000000000000000  |        0000000000000000
     NE:        FFFFFFFFFFFFFFFF  |        FFFFFFFFFFFFFFFF
     LT:        FFFFFFFFFFFFFFFF  |        0000000000000000
     LE:        FFFFFFFFFFFFFFFF  |        0000000000000000
     GT:        0000000000000000  |        FFFFFFFFFFFFFFFF
     GE:        0000000000000000  |        FFFFFFFFFFFFFFFF
  ORDERED:        FFFFFFFFFFFFFFFF  |        FFFFFFFFFFFFFFFF
 UNORDERED:        0000000000000000  |        0000000000000000
Results for AvxPackedCompareF64 (iteration = 1)
     a:               nan  |         3.141592653590
     b:         2.718281828459  |         3.141592653590
     EQ:        0000000000000000  |        FFFFFFFFFFFFFFFF
     NE:        FFFFFFFFFFFFFFFF  |        0000000000000000
     LT:        0000000000000000  |        0000000000000000
     LE:        0000000000000000  |        FFFFFFFFFFFFFFFF
     GT:        0000000000000000  |        0000000000000000
     GE:        0000000000000000  |        FFFFFFFFFFFFFFFF
  ORDERED:        0000000000000000  |        FFFFFFFFFFFFFFFF
 UNORDERED:        FFFFFFFFFFFFFFFF  |        0000000000000000

Packed Floating-Point Conversions

The next source code example is named Ch06_03. This example demonstrates conversions between packed signed doubleword integers and packed floating-point values. It also illustrates conversions between packed single-precision and packed double-precision floating-point values. Listing 6-3 shows the source code for example Ch06_03.
//------------------------------------------------
//        Ch06_03.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#define _USE_MATH_DEFINES
#include <math.h>
#include "XmmVal.h"
using namespace std;
// The order of values in the following enum must match the jump table
// that's defined in Ch06_03_.asm.
enum CvtOp : unsigned int
{
  I32_F32, F32_I32, I32_F64, F64_I32, F32_F64, F64_F32,
};
extern "C" bool AvxPackedConvertFP_(const XmmVal& a, XmmVal& b, CvtOp cvt_op);
void AvxPackedConvertF32(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  a.m_I32[0] = 10;
  a.m_I32[1] = -500;
  a.m_I32[2] = 600;
  a.m_I32[3] = -1024;
  AvxPackedConvertFP_(a, b, CvtOp::I32_F32);
  cout << " Results for CvtOp::I32_F32 ";
  cout << "a: " << a.ToStringI32() << ' ';
  cout << "b: " << b.ToStringF32() << ' ';
  a.m_F32[0] = 1.0f / 3.0f;
  a.m_F32[1] = 2.0f / 3.0f;
  a.m_F32[2] = -a.m_F32[0] * 2.0f;
  a.m_F32[3] = -a.m_F32[1] * 2.0f;
  AvxPackedConvertFP_(a, b, CvtOp::F32_I32);
  cout << " Results for CvtOp::F32_I32 ";
  cout << "a: " << a.ToStringF32() << ' ';
  cout << "b: " << b.ToStringI32() << ' ';
  // F32_F64 converts the two low-order SPFP values of 'a'
  a.m_F32[0] = 1.0f / 7.0f;
  a.m_F32[1] = 2.0f / 9.0f;
  a.m_F32[2] = 0;
  a.m_F32[3] = 0;
  AvxPackedConvertFP_(a, b, CvtOp::F32_F64);
  cout << " Results for CvtOp::F32_F64 ";
  cout << "a: " << a.ToStringF32() << ' ';
  cout << "b: " << b.ToStringF64() << ' ';
}
void AvxPackedConvertF64(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  // I32_F64 converts the two low-order doubleword integers of 'a'
  a.m_I32[0] = 10;
  a.m_I32[1] = -20;
  a.m_I32[2] = 0;
  a.m_I32[3] = 0;
  AvxPackedConvertFP_(a, b, CvtOp::I32_F64);
  cout << " Results for CvtOp::I32_F64 ";
  cout << "a: " << a.ToStringI32() << ' ';
  cout << "b: " << b.ToStringF64() << ' ';
  // F64_I32 sets the two high-order doublewords of 'b' to zero
  a.m_F64[0] = M_PI;
  a.m_F64[1] = M_E;
  AvxPackedConvertFP_(a, b, CvtOp::F64_I32);
  cout << " Results for CvtOp::F64_I32 ";
  cout << "a: " << a.ToStringF64() << ' ';
  cout << "b: " << b.ToStringI32() << ' ';
  // F64_F32 sets the two high-order SPFP values of 'b' to zero
  a.m_F64[0] = M_SQRT2;
  a.m_F64[1] = M_SQRT1_2;
  AvxPackedConvertFP_(a, b, CvtOp::F64_F32);
  cout << " Results for CvtOp::F64_F32 ";
  cout << "a: " << a.ToStringF64() << ' ';
  cout << "b: " << b.ToStringF32() << ' ';
}
int main()
{
  AvxPackedConvertF32();
  AvxPackedConvertF64();
  return 0;
}
;-------------------------------------------------
;        Ch06_03.asm
;-------------------------------------------------
; extern "C" bool AvxPackedConvertFP_(const XmmVal& a, XmmVal& b, CvtOp cvt_op);
;
; Note:     This function requires linker option /LARGEADDRESSAWARE:NO
;        to be explicitly set.
    .code
AvxPackedConvertFP_ proc
; Make sure cvt_op is valid
    mov r9d,r8d             ;r9 = cvt_op (zero extended)
    cmp r9,CvtOpTableCount       ;is cvt_op valid?
    jae InvalidCvtOp          ;jmp if cvt_op is invalid
    mov eax,1              ;set valid cvt_op return code
    jmp [CvtOpTable+r9*8]        ;jump to specified conversion
; Convert packed signed doubleword integers to packed SPFP values
I32_F32:
    vmovdqa xmm0,xmmword ptr [rcx]
    vcvtdq2ps xmm1,xmm0
    vmovaps xmmword ptr [rdx],xmm1
    ret
; Convert packed SPFP values to packed signed doubleword integers
F32_I32:
    vmovaps xmm0,xmmword ptr [rcx]
    vcvtps2dq xmm1,xmm0
    vmovdqa xmmword ptr [rdx],xmm1
    ret
; Convert packed signed doubleword integers to packed DPFP values
I32_F64:
    vmovdqa xmm0,xmmword ptr [rcx]
    vcvtdq2pd xmm1,xmm0
    vmovapd xmmword ptr [rdx],xmm1
    ret
; Convert packed DPFP values to packed signed doubleword integers
F64_I32:
    vmovapd xmm0,xmmword ptr [rcx]
    vcvtpd2dq xmm1,xmm0
    vmovdqa xmmword ptr [rdx],xmm1
    ret
; Convert packed SPFP to packed DPFP
F32_F64:
    vmovaps xmm0,xmmword ptr [rcx]
    vcvtps2pd xmm1,xmm0
    vmovapd xmmword ptr [rdx],xmm1
    ret
; Convert packed DPFP to packed SPFP
F64_F32:
    vmovapd xmm0,xmmword ptr [rcx]
    vcvtpd2ps xmm1,xmm0
    vmovaps xmmword ptr [rdx],xmm1
    ret
InvalidCvtOp:
    xor eax,eax             ;set invalid cvt_op return code
    ret
; The order of values in the following table must match the enum CvtOp
; that's defined in Ch06_03.cpp.
      align 8
CvtOpTable qword I32_F32, F32_I32
      qword I32_F64, F64_I32
      qword F32_F64, F64_F32
CvtOpTableCount equ ($ - CvtOpTable) / size qword
AvxPackedConvertFP_ endp
    end
Listing 6-3. Example Ch06_03

The C++ code begins with an enum named CvtOp that defines the conversion operations supported by the assembly language function AvxPackedConvertFP_. The actual enumerator values in CvtOp are critical since the assembly language code uses them as indices into a jump table. The function that follows CvtOp, AvxPackedConvertF32, exercises some test cases using packed single-precision floating-point operands. Similarly, the function AvxPackedConvertF64 contains test cases for packed double-precision floating-point operands. As in the previous examples of this chapter, all XmmVal variable declarations in these functions use the alignas(16) specifier to ensure proper alignment.

Toward the bottom of the assembly language code in Listing 6-3 is the previously mentioned jump table. CvtOpTable contains a list of labels that are defined in the function AvxPackedConvertFP_. The target of each label is a short code block that performs a specific conversion. The equate CvtOpTableCount defines the number of items in the jump table and is used to validate the argument value cvt_op. The align 8 directive instructs the assembler to align CvtOpTable on a quadword boundary in order to avoid unaligned memory accesses when referencing elements in the table. Note that CvtOpTable is defined inside the assembly language function AvxPackedConvertFP_ (i.e., between the proc and endp directives), which means that storage for the table is allocated in a .code section. Clearly, the jump table does not contain any intentional executable instructions, and this is why the table is positioned after the ret instruction. This also means that the jump table is read-only; the processor will generate an exception on any write attempt to the table.

The assembly language function AvxPackedConvertFP_ begins its execution by validating the argument value cvt_op. The ensuing jmp [CvtOpTable+r9*8] instruction transfers control to a code block that performs the actual packed data conversion. During execution of this instruction, the processor loads register RIP with the contents of the memory location specified by [CvtOpTable+r9*8]. In the current example, register R9 contains cvt_op, and this value is used as an index into CvtOpTable.
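A jump table is the assembly language counterpart of a C++ table of function pointers. The following sketch is hypothetical (it is not part of example Ch06_03, and the per-conversion helper function names are assumed); it shows how the same bounds check and indexed dispatch could be expressed in C++:
#include <cstddef>
// Assumes XmmVal.h and the CvtOp enum from Ch06_03.cpp are visible.

// Hypothetical per-conversion helpers, one for each CvtOp enumerator
void CvtI32F32(const XmmVal& a, XmmVal& b);
void CvtF32I32(const XmmVal& a, XmmVal& b);
void CvtI32F64(const XmmVal& a, XmmVal& b);
void CvtF64I32(const XmmVal& a, XmmVal& b);
void CvtF32F64(const XmmVal& a, XmmVal& b);
void CvtF64F32(const XmmVal& a, XmmVal& b);

bool PackedConvertFP(const XmmVal& a, XmmVal& b, CvtOp cvt_op)
{
    using CvtFunc = void (*)(const XmmVal&, XmmVal&);

    // Table order must match enum CvtOp, just like CvtOpTable in the .asm file
    static const CvtFunc cvt_table[] =
    {
        CvtI32F32, CvtF32I32, CvtI32F64, CvtF64I32, CvtF32F64, CvtF64F32
    };
    const size_t cvt_table_count = sizeof(cvt_table) / sizeof(cvt_table[0]);

    if ((unsigned int)cvt_op >= cvt_table_count)
        return false;                   // mirrors the cmp r9 / jae InvalidCvtOp check

    cvt_table[cvt_op](a, b);            // indirect call, like jmp [CvtOpTable+r9*8]
    return true;
}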

The conversion code blocks in AvxPackedConvertFP_ use the aligned move instructions vmovaps, vmovapd, and vmovdqa to transfer packed operands to and from memory. Specific AVX conversion instructions carry out the requested operations. For example, the vcvtps2dq and vcvtdq2ps instructions convert packed single-precision floating-point values to packed signed doubleword integers and vice versa. When used with 128-bit wide operands, these instructions convert four values simultaneously. The counterpart double-precision instructions, vcvtpd2dq and vcvtdq2pd, are slightly different in that only two values are converted due to the element size differences (32 and 64 bits). The vcvtps2pd and vcvtpd2ps instructions perform their conversions in a similar manner. Note that the vcvtpd2dq and vcvtpd2ps instructions set the high-order 64 bits of the destination operand to zero. All of the AVX packed conversion instructions use the rounding mode that's specified by the rounding control field MXCSR.RC, as described in Chapter 4. The default rounding mode for Visual C++ is round to nearest; in the output below, for example, the F32_I32 case converts 0.666667 to 1 and -1.333333 to -1. Here is the output for example Ch06_03:
Results for CvtOp::I32_F32
a:        10      -500  |       600      -1024
b:    10.000000   -500.000000  |   600.000000  -1024.000000
Results for CvtOp::F32_I32
a:     0.333333    0.666667  |    -0.666667    -1.333333
b:        0        1  |       -1       -1
Results for CvtOp::F32_F64
a:     0.142857    0.222222  |    0.000000    0.000000
b:          0.142857149243  |         0.222222223878
Results for CvtOp::I32_F64
a:        10       -20  |        0        0
b:         10.000000000000  |        -20.000000000000
Results for CvtOp::F64_I32
a:          3.141592653590  |         2.718281828459
b:        3        3  |        0        0
Results for CvtOp::F64_F32
a:          1.414213562373  |         0.707106781187
b:     1.414214    0.707107  |    0.000000    0.000000

Packed Floating-Point Arrays

The computational resources of AVX are often employed to accelerate calculations using arrays of single-precision or double-precision floating-point values. In this section, you learn how to use packed arithmetic to process multiple elements of a floating-point array simultaneously. You also see examples of additional AVX instructions and learn how to perform runtime alignment checks of operands in memory.

Packed Floating-Point Square Roots

Listing 6-4 shows the code for example Ch06_04, which illustrates how to perform a simple packed arithmetic calculation using a single-precision floating-point array. It also explains how to perform a runtime check of an array’s address to ensure that it’s properly aligned in memory.
//------------------------------------------------
//        Ch06_04.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <random>
#include <cmath>
using namespace std;
extern "C" bool AvxCalcSqrts_(float* y, const float* x, size_t n);
void Init(float* x, size_t n, unsigned int seed)
{
  uniform_int_distribution<> ui_dist {1, 2000};
  default_random_engine rng {seed};
  for (size_t i = 0; i < n; i++)
    x[i] = (float)ui_dist(rng);
}
bool AvxCalcSqrtsCpp(float* y, const float* x, size_t n)
{
  const size_t alignment = 16;
  if (n == 0)
    return false;
  if (((uintptr_t)x % alignment) != 0)
    return false;
  if (((uintptr_t)y % alignment) != 0)
    return false;
  for (size_t i = 0; i < n; i++)
    y[i] = sqrt(x[i]);
  return true;
}
int main()
{
  const size_t n = 19;
  alignas(16) float x[n];
  alignas(16) float y1[n];
  alignas(16) float y2[n];
  Init(x, n, 53);
  bool rc1 = AvxCalcSqrtsCpp(y1, x, n);
  bool rc2 = AvxCalcSqrts_(y2, x, n);
  cout << fixed << setprecision(4);
  cout << " Results for AvxCalcSqrts ";
  if (!rc1 || !rc2)
    cout << "Invalid return code ";
  else
  {
    const char* sp = "  ";
    for (size_t i = 0; i < n; i++)
    {
      cout << "i: " << setw(2) << i << sp;
      cout << "x: " << setw(9) << x[i] << sp;
      cout << "y1: " << setw(9) << y1[i] << sp;
      cout << "y2: " << setw(9) << y2[i] << ' ';
    }
  }
}
;-------------------------------------------------
;        Ch06_04.asm
;-------------------------------------------------
; extern "C" bool AvxCalcSqrts_(float* y, const float* x, size_t n);
    .code
AvxCalcSqrts_ proc
    xor eax,eax             ;set error return code (also array offset)
    test r8,r8
    jz Done               ;jump if n is zero
    test rcx,0fh
    jnz Done              ;jump if 'y' is not aligned
    test rdx,0fh
    jnz Done              ;jump if 'x' is not aligned
; Calculate packed square roots
    cmp r8,4
    jb FinalVals            ;jump if n < 4
@@:   vsqrtps xmm0,xmmword ptr [rdx+rax] ;calculate 4 square roots x[i+3:i]
    vmovaps xmmword ptr [rcx+rax],xmm0 ;save results to y[i+3:i]
    add rax,16             ;update offset to next set of values
    sub r8,4
    cmp r8,4              ;are there 4 or more elements remaining?
    jae @B               ;jump if yes
; Calculate square roots of final 1 - 3 values, note switch to scalar instructions
FinalVals:
    test r8,r8             ;more elements to process?
    jz SetRC              ;jump if no more elements
    vsqrtss xmm0,xmm0,real4 ptr [rdx+rax]  ;calculate sqrt(x[i])
    vmovss real4 ptr [rcx+rax],xmm0     ;save result to y[i]
    add rax,4
    dec r8
    jz SetRC
    vsqrtss xmm0,xmm0,real4 ptr [rdx+rax]
    vmovss real4 ptr [rcx+rax],xmm0
    add rax,4
    dec r8
    jz SetRC
    vsqrtss xmm0,xmm0,real4 ptr [rdx+rax]
    vmovss real4 ptr [rcx+rax],xmm0
SetRC: mov eax,1              ;set success return code
Done:  ret
AvxCalcSqrts_ endp
    end
Listing 6-4. Example Ch06_04

The C++ code in Listing 6-4 includes a function named AvxCalcSqrtsCpp, which calculates y[i] = sqrt(x[i]). Before performing any of the required calculations, the array size argument n is tested to make sure that it’s not equal to zero. The pointers y and x are also tested to ensure that the respective arrays are properly aligned to a 16-byte boundary. An array is aligned to a 16-byte boundary if its address is evenly divisible by 16. The function returns an error code if any of these checks fail.

Assembly language function AvxCalcSqrts_ mimics the functionality of its C++ counterpart. The test r8,r8 and jz Done instructions ensure that the number of array elements n is greater than zero. The ensuing test rcx,0fh instruction checks array y for alignment to a 16-byte boundary. Recall that the test instruction performs a bitwise AND of its two operands and sets the status flags in RFLAGS according to the result (the actual result of the bitwise AND is discarded). If the test rcx,0fh instruction yields a non-zero value, array y is not aligned on a 16-byte boundary, and the function exits without performing any calculations. A similar test is used to ensure that array x is properly aligned.
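Because 16 is a power of two, the modulo test used in the C++ code and the bitwise AND test used in the assembly code are equivalent. The tiny function below is illustrative only (it is not part of the example):
#include <cstdint>

// Both expressions are true only when p is aligned on a 16-byte boundary.
bool IsAligned16(const void* p)
{
    bool aligned_mod = ((uintptr_t)p % 16) == 0;    // form used in AvxCalcSqrtsCpp
    bool aligned_and = ((uintptr_t)p & 0xf) == 0;   // form used by test rcx,0fh
    return aligned_mod && aligned_and;
}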

The processing loop uses a vsqrtps instruction to calculate the required square roots. When used with 128-bit wide operands, this instruction calculates four single-precision floating-point square roots simultaneously. Using 128-bit wide operands means that the processing loop cannot execute a vsqrtps instruction if there are fewer than four element values remaining to be processed. Before performing any calculations using vsqrtps, R8 is checked to make sure that it’s greater than or equal to four. If R8 is less than four, the processing loop is skipped. The processing loop employs a vsqrtps xmm0,xmmword ptr [rdx+rax] instruction to calculate square roots of the four single-precision floating-point values located at the memory address specified by the source operand. It then stores the calculated square roots in register XMM0. A vmovaps xmmword ptr [rcx+rax],xmm0 instruction saves the four calculated square roots to y. Execution of the vsqrtps and vmovaps instructions continues until the number of elements remaining to be processed is less than four.
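The same blocking scheme can be written in C++ with AVX intrinsics. The following function is an illustrative sketch only (it is not the book’s implementation); it processes four elements per iteration with _mm_sqrt_ps, the intrinsic form of vsqrtps, and handles the remaining zero to three elements with scalar code, as described next:
#include <cmath>
#include <cstddef>
#include <immintrin.h>

// Assumes y and x are aligned on a 16-byte boundary.
void CalcSqrtsAvx(float* y, const float* x, size_t n)
{
    size_t i = 0;

    for (; i + 4 <= n; i += 4)                      // main loop: 4 values per iteration
    {
        __m128 vals = _mm_load_ps(&x[i]);           // aligned load, like vmovaps
        _mm_store_ps(&y[i], _mm_sqrt_ps(vals));     // like vsqrtps followed by vmovaps
    }

    for (; i < n; i++)                              // final 1 - 3 values
        y[i] = std::sqrt(x[i]);
}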

Following execution of the processing loop, the block of code starting at label FinalVals calculates the square roots for the final few values of array x. Note that the scalar AVX instructions vsqrtss and vmovss perform these final (one, two, or three) calculations. Here is the output for source code example Ch06_04:
Results for AvxCalcSqrts
i:  0  x: 1354.0000  y1:  36.7967  y2:  36.7967
i:  1  x:  494.0000  y1:  22.2261  y2:  22.2261
i:  2  x: 1638.0000  y1:  40.4722  y2:  40.4722
i:  3  x:  278.0000  y1:  16.6733  y2:  16.6733
i:  4  x: 1004.0000  y1:  31.6860  y2:  31.6860
i:  5  x:  318.0000  y1:  17.8326  y2:  17.8326
i:  6  x: 1735.0000  y1:  41.6533  y2:  41.6533
i:  7  x: 1221.0000  y1:  34.9428  y2:  34.9428
i:  8  x:  544.0000  y1:  23.3238  y2:  23.3238
i:  9  x: 1568.0000  y1:  39.5980  y2:  39.5980
i: 10  x: 1633.0000  y1:  40.4104  y2:  40.4104
i: 11  x: 1577.0000  y1:  39.7115  y2:  39.7115
i: 12  x: 1659.0000  y1:  40.7308  y2:  40.7308
i: 13  x: 1565.0000  y1:  39.5601  y2:  39.5601
i: 14  x:  74.0000  y1:  8.6023  y2:  8.6023
i: 15  x: 1195.0000  y1:  34.5688  y2:  34.5688
i: 16  x:  406.0000  y1:  20.1494  y2:  20.1494
i: 17  x:  483.0000  y1:  21.9773  y2:  21.9773
i: 18  x: 1307.0000  y1:  36.1525  y2:  36.1525

The source code in Listing 6-4 can be easily adapted to process double-precision instead of single-precision floating-point values. In the C++ code, changing all float variables to double is the only required modification. In the assembly language code, the vsqrtpd and vmovapd instructions must be used instead of vsqrtps and vmovaps. The counting variables in AvxCalcSqrts_ must also be changed to process two double-precision instead of four single-precision floating-point values per iteration.
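Here is what the corresponding double-precision processing loop might look like using intrinsics. This is an illustrative sketch only, not code from the download package; note that only two elements are processed per iteration:
#include <cmath>
#include <cstddef>
#include <immintrin.h>

// Assumes y and x are aligned on a 16-byte boundary.
void CalcSqrtsF64(double* y, const double* x, size_t n)
{
    size_t i = 0;

    for (; i + 2 <= n; i += 2)                      // 2 values per iteration
        _mm_store_pd(&y[i], _mm_sqrt_pd(_mm_load_pd(&x[i])));  // vsqrtpd / vmovapd

    if (i < n)                                      // final value if n is odd
        y[i] = std::sqrt(x[i]);
}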

Packed Floating-Point Array Min-Max

Listing 6-5 shows the source code for example Ch06_05. This example demonstrates how to compute the minimum and maximum value of a single-precision floating-point array using packed AVX instructions.
//------------------------------------------------
//        Ch06_05.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <random>
#include <limits>
#include "AlignedMem.h"
using namespace std;
extern "C" float g_MinValInit = numeric_limits<float>::max();
extern "C" float g_MaxValInit = -numeric_limits<float>::max();
extern "C" bool CalcArrayMinMaxF32_(float* min_val, float* max_val, const float* x, size_t n);
void Init(float* x, size_t n, unsigned int seed)
{
  uniform_int_distribution<> ui_dist {1, 10000};
  default_random_engine rng {seed};
  for (size_t i = 0; i < n; i++)
    x[i] = (float)ui_dist(rng);
}
bool CalcArrayMinMaxF32Cpp(float* min_val, float* max_val, const float* x, size_t n)
{
  // Make sure x is properly aligned
  if (!AlignedMem::IsAligned(x, 16))
    return false;
  // Find array minimum and maximum values
  float min_val_temp = g_MinValInit;
  float max_val_temp = g_MaxValInit;
  if (n > 0)
  {
    for (size_t i = 0; i < n; i++)
    {
      if (x[i] < min_val_temp)
        min_val_temp = x[i];
      if (x[i] > max_val_temp)
        max_val_temp = x[i];
    }
  }
  *min_val = min_val_temp;
  *max_val = max_val_temp;
  return true;
}
int main()
{
  const size_t n = 31;
  alignas(16) float x[n];
  Init(x, n, 73);
  float min_val1, max_val1;
  float min_val2, max_val2;
  CalcArrayMinMaxF32Cpp(&min_val1, &max_val1, x, n);
  CalcArrayMinMaxF32_(&min_val2, &max_val2, x, n);
  cout << fixed << setprecision(1);
  cout << "-------------- Array x -------------- ";
  for (size_t i = 0; i < n; i++)
  {
    cout << "x[" << setw(2) << i << "]: " << setw(9) << x[i];
    if (i & 1)
      cout << '\n';
    else
      cout << "  ";
  }
  cout << '\n';
  cout << "\nResults for CalcArrayMinMaxF32Cpp\n";
  cout << " min_val = " << setw(9) << min_val1 << ", ";
  cout << " max_val = " << setw(9) << max_val1 << '\n';
  cout << "\nResults for CalcArrayMinMaxF32_\n";
  cout << " min_val = " << setw(9) << min_val2 << ", ";
  cout << " max_val = " << setw(9) << max_val2 << '\n';
  return 0;
}
;-------------------------------------------------
;        Ch06_05.asm
;-------------------------------------------------
    extern g_MinValInit:real4
    extern g_MaxValInit:real4
; extern "C" bool CalcArrayMinMaxF32_(float* min_val, float* max_val, const float* x, size_t n)
    .code
CalcArrayMinMaxF32_ proc
; Validate arguments
    xor eax,eax             ;set error return code
    test r8,0fh             ;is x aligned to 16-byte boundary?
    jnz Done              ;jump if no
    vbroadcastss xmm4,real4 ptr [g_MinValInit]   ;xmm4 = min values
    vbroadcastss xmm5,real4 ptr [g_MaxValInit]   ;xmm5 = max values
    cmp r9,4
    jb FinalVals            ;jump if n < 4
; Main processing loop
@@:   vmovaps xmm0,xmmword ptr [r8]    ;load next set of array values
    vminps xmm4,xmm4,xmm0        ;update packed min values
    vmaxps xmm5,xmm5,xmm0        ;update packed max values
    add r8,16
    sub r9,4
    cmp r9,4
    jae @B
; Process the final 1 - 3 values of the input array
FinalVals:
    test r9,r9
    jz SaveResults
    vminss xmm4,xmm4,real4 ptr [r8]   ;update packed min values
    vmaxss xmm5,xmm5,real4 ptr [r8]   ;update packed max values
    dec r9
    jz SaveResults
    vminss xmm4,xmm4,real4 ptr [r8+4]
    vmaxss xmm5,xmm5,real4 ptr [r8+4]
    dec r9
    jz SaveResults
    vminss xmm4,xmm4,real4 ptr [r8+8]
    vmaxss xmm5,xmm5,real4 ptr [r8+8]
; Calculate and save final min & max values
SaveResults:
    vshufps xmm0,xmm4,xmm4,00001110b  ;xmm0[63:0] = xmm4[127:64]
    vminps xmm1,xmm0,xmm4        ;xmm1[63:0] contains final 2 values
    vshufps xmm2,xmm1,xmm1,00000001b  ;xmm2[31:0] = xmm1[63:32]
    vminps xmm3,xmm2,xmm1        ;xmm3[31:0] contains final value
    vmovss real4 ptr [rcx],xmm3     ;save array min value
    vshufps xmm0,xmm5,xmm5,00001110b
    vmaxps xmm1,xmm0,xmm5
    vshufps xmm2,xmm1,xmm1,00000001b
    vmaxps xmm3,xmm2,xmm1
    vmovss real4 ptr [rdx],xmm3     ;save array max value
    mov eax,1              ;set success return code
Done:  ret
CalcArrayMinMaxF32_ endp
    end
Listing 6-5. Example Ch06_05

The structure of the C++ source code that’s shown in Listing 6-5 is similar to the previous array example. The function CalcArrayMinMaxF32Cpp uses a simple for loop to determine the array’s minimum and maximum values. Prior to the for loop, the template function AlignedMem::IsAligned verifies that source array x is properly aligned. You’ll learn more about class AlignedMem in Chapter 7. The initial minimum and maximum values are obtained from the global variables g_MinValInit and g_MaxValInit, which are initialized to numeric_limits<float>::max() and its negation, respectively. Global variables are employed here to ensure that the functions CalcArrayMinMaxF32Cpp and CalcArrayMinMaxF32_ use the same initial values.
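The source code for class AlignedMem isn’t shown in this chapter, but an alignment test like AlignedMem::IsAligned can be written in just a few lines. The following is a plausible minimal sketch, not the class’s actual definition (the real class in the download package also performs aligned memory management):
#include <cstddef>
#include <cstdint>

struct AlignedMem
{
    // Returns true if p is aligned on the specified boundary.
    // Assumes alignment is a power of two (e.g., 16).
    static bool IsAligned(const void* p, size_t alignment)
    {
        return ((uintptr_t)p & (alignment - 1)) == 0;
    }
};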

Upon entry to the assembly language function CalcArrayMinMaxF32_, the array x is tested for proper alignment. If array x is properly aligned, a vbroadcastss xmm4,real4 ptr [g_MinValInit] instruction initializes all four single-precision floating-point elements in register XMM4 with the value g_MinValInit. The subsequent vbroadcastss xmm5,real4 ptr [g_MaxValInit] instruction broadcasts g_MaxValInit to all four element positions in register XMM5.

Like the previous example, the processing loop in CalcArrayMinMaxF32_ examines four array elements during each iteration. The vminps xmm4,xmm4,xmm0 and vmaxps xmm5,xmm5,xmm0 instructions maintain intermediate packed minimum and maximum values in registers XMM4 and XMM5, respectively. The processing loop continues until there are fewer than four elements remaining. The final elements in the array are tested using the scalar instructions vminss and vmaxss.

Subsequent to the execution of the vmaxss instruction that’s immediately above the label SaveResults, register XMM4 contains four single-precision floating-point values, and one of these values is the minimum for array x. A series of vshufps (Packed Interleave Shuffle Single-Precision Floating-Point Values) and vminps instructions is then used to determine the final minimum value. The vshufps xmm0,xmm4,xmm4,00001110b instruction copies the two high-order floating-point elements in register XMM4 to the low-order element positions in XMM0 (i.e., XMM0[63:0] = XMM4[127:64]). This instruction uses the bit values of its immediate operand as indices for selecting elements to copy.

The immediate operand that’s used by the vshufps instruction warrants further explanation. In the current example, bits 1:0 (10b) of the immediate operand instruct the processor to copy single-precision floating-point element #2 (XMM4[95:64]) from the first source operand to element position #0 (XMM0[31:0]) of the destination operand. Bits 3:2 (11b) of the immediate operand instruct the processor to copy element #3 (XMM4[127:96]) of the first source operand to element position #1 (XMM0[63:32]) of the destination operand. Bits 5:4 and 7:6 of the immediate operand can be used to copy elements from the second source operand to element positions #2 (XMM0[95:64]) and #3 (XMM0[127:96]) of the destination operand, but they’re not needed in the current example. The vshufps instruction is followed by a vminps xmm1,xmm0,xmm4 instruction that yields the two remaining candidate minimum values in XMM1[63:32] and XMM1[31:0]. Another sequence of vshufps and vminps instructions is then used to extract the final minimum value. Figure 6-3 illustrates this reduction process in greater detail.
Figure 6-3. Packed minimum reduction using vshufps and vminps instructions
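The reduction in Figure 6-3 can also be expressed with C++ intrinsics, which some readers may find easier to step through. The function below is illustrative only (it is not part of example Ch06_05); it mirrors the vshufps/vminps sequence used in CalcArrayMinMaxF32_:
#include <immintrin.h>

// Horizontal minimum of the four single-precision values in min4.
float ReduceMinF32(__m128 min4)
{
    // copy elements 2 and 3 of min4 to the two low-order element positions
    __m128 temp1 = _mm_shuffle_ps(min4, min4, 0x0e);    // shuffle control 00001110b
    __m128 min2 = _mm_min_ps(temp1, min4);              // two candidate minimums remain

    // copy element 1 of min2 to element position 0
    __m128 temp2 = _mm_shuffle_ps(min2, min2, 0x01);    // shuffle control 00000001b
    __m128 min1 = _mm_min_ps(temp2, min2);              // final minimum in element 0

    return _mm_cvtss_f32(min1);
}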

Following calculation of the array’s minimum value, an analogous series of vshufps and vmaxps instructions determines the maximum value using the same reduction technique. Here are the results for example Ch06_05:
-------------- Array x --------------
x[ 0]:  2183.0  x[ 1]:  4547.0
x[ 2]:  9279.0  x[ 3]:  7291.0
x[ 4]:  5105.0  x[ 5]:  6505.0
x[ 6]:  4820.0  x[ 7]:   994.0
x[ 8]:  1559.0  x[ 9]:  3867.0
x[10]:  7272.0  x[11]:  9698.0
x[12]:  6181.0  x[13]:  4742.0
x[14]:  7279.0  x[15]:  1224.0
x[16]:  4840.0  x[17]:  8453.0
x[18]:  6876.0  x[19]:  1786.0
x[20]:  4022.0  x[21]:   911.0
x[22]:  6676.0  x[23]:  2979.0
x[24]:  4431.0  x[25]:  6133.0
x[26]:  7093.0  x[27]:  9892.0
x[28]:  9622.0  x[29]:  5058.0
x[30]:  1264.0
Results for CalcArrayMinMaxF32Cpp
 min_val =   911.0,  max_val =  9892.0
Results for CalcArrayMinMaxF32_
 min_val =   911.0,  max_val =  9892.0

Packed Floating-Point Least Squares

Source code example Ch06_06 details the calculation of a least squares regression line using packed double-precision floating-point arithmetic. Listing 6-6 shows the C++ and x86 assembly language source code for example Ch06_06.
//------------------------------------------------
//        Ch06_06.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <cstddef>
#include "AlignedMem.h"
using namespace std;
extern "C" double LsEpsilon = 1.0e-12;
extern "C" bool AvxCalcLeastSquares_(const double* x, const double* y, int n, double* m, double* b);
bool AvxCalcLeastSquaresCpp(const double* x, const double* y, int n, double* m, double* b)
{
  if (n < 2)
    return false;
  if (!AlignedMem::IsAligned(x, 16) || !AlignedMem::IsAligned(y, 16))
    return false;
  double sum_x = 0, sum_y = 0.0, sum_xx = 0, sum_xy = 0.0;
  for (int i = 0; i < n; i++)
  {
    sum_x += x[i];
    sum_xx += x[i] * x[i];
    sum_xy += x[i] * y[i];
    sum_y += y[i];
  }
  double denom = n * sum_xx - sum_x * sum_x;
  if (fabs(denom) >= LsEpsilon)
  {
    *m = (n * sum_xy - sum_x * sum_y) / denom;
    *b = (sum_xx * sum_y - sum_x * sum_xy) / denom;
    return true;
  }
  else
  {
    *m = *b = 0.0;
    return false;
  }
}
int main()
{
  const int n = 11;
  alignas(16) double x[n] = {10, 13, 17, 19, 23, 7, 35, 51, 89, 92, 99};
  alignas(16) double y[n] = {1.2, 1.1, 1.8, 2.2, 1.9, 0.5, 3.1, 5.5, 8.4, 9.7, 10.4};
  double m1 = 0, m2 = 0;
  double b1 = 0, b2 = 0;
  bool rc1 = AvxCalcLeastSquaresCpp(x, y, n, &m1, &b1);
  bool rc2 = AvxCalcLeastSquares_(x, y, n, &m2, &b2);
  cout << fixed << setprecision(8);
  cout << " Results from AvxCalcLeastSquaresCpp ";
  cout << " rc:     " << setw(12) << boolalpha << rc1 << ' ';
  cout << " slope:   " << setw(12) << m1 << ' ';
  cout << " intercept:: " << setw(12) << b1 << ' ';
  cout << " Results from AvxCalcLeastSquares_ ";
  cout << " rc:     " << setw(12) << boolalpha << rc2 << ' ';
  cout << " slope:   " << setw(12) << m2 << ' ';
  cout << " intercept:: " << setw(12) << b2 << ' ';
  return 0;
}
;-------------------------------------------------
;        Ch06_06.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
    extern LsEpsilon:real8       ;global value defined in C++ file
; extern "C" bool AvxCalcLeastSquares_(const double* x, const double* y, int n, double* m, double* b);
;
; Returns    0 = error (invalid n or improperly aligned array), 1 = success
    .const
    align 16
AbsMaskF64 qword 7fffffffffffffffh, 7fffffffffffffffh ;mask for DPFP absolute value
    .code
AvxCalcLeastSquares_ proc frame
    _CreateFrame LS_,0,48,rbx
    _SaveXmmRegs xmm6,xmm7,xmm8
    _EndProlog
; Validate arguments
    xor eax,eax             ;set error return code
    cmp r8d,2
    jl Done               ;jump if n < 2
    test rcx,0fh
    jnz Done              ;jump if x not aligned to 16-byte boundary
    test rdx,0fh
    jnz Done              ;jump if y not aligned to 16-byte boundary
; Perform required initializations
    vcvtsi2sd xmm3,xmm3,r8d       ;xmm3 = n
    mov eax,r8d
    and r8d,0fffffffeh         ;r8d = n / 2 * 2
    and eax,1              ;eax = n % 2
    vxorpd xmm4,xmm4,xmm4        ;sum_x (both qwords)
    vxorpd xmm5,xmm5,xmm5        ;sum_y (both qwords)
    vxorpd xmm6,xmm6,xmm6        ;sum_xx (both qwords)
    vxorpd xmm7,xmm7,xmm7        ;sum_xy (both qwords)
    xor ebx,ebx             ;rbx = array offset
    mov r10,[rbp+LS_OffsetStackArgs]  ;r10 = b
; Calculate sum variables. Note that two values are processed each iteration.
@@:   vmovapd xmm0,xmmword ptr [rcx+rbx] ;load next two x values
    vmovapd xmm1,xmmword ptr [rdx+rbx] ;load next two y values
    vaddpd xmm4,xmm4,xmm0        ;update sum_x
    vaddpd xmm5,xmm5,xmm1        ;update sum_y
    vmulpd xmm2,xmm0,xmm0        ;calc x * x
    vaddpd xmm6,xmm6,xmm2        ;update sum_xx
    vmulpd xmm2,xmm0,xmm1        ;calc x * y
    vaddpd xmm7,xmm7,xmm2        ;update sum_xy
    add rbx,16             ;rbx = next offset
    sub r8d,2              ;adjust counter
    jnz @B               ;repeat until done
; Update sum variables with the final x, y values if 'n' is odd
    or eax,eax
    jz CalcFinalSums          ;jump if n is even
    vmovsd xmm0,real8 ptr [rcx+rbx]   ;load final x
    vmovsd xmm1,real8 ptr [rdx+rbx]   ;load final y
    vaddsd xmm4,xmm4,xmm0        ;update sum_x
    vaddsd xmm5,xmm5,xmm1        ;update sum_y
    vmulsd xmm2,xmm0,xmm0        ;calc x * x
    vaddsd xmm6,xmm6,xmm2        ;update sum_xx
    vmulsd xmm2,xmm0,xmm1        ;calc x * y
    vaddsd xmm7,xmm7,xmm2        ;update sum_xy
; Calculate final sum_x, sum_y, sum_xx, sum_xy
CalcFinalSums:
    vhaddpd xmm4,xmm4,xmm4       ;xmm4[63:0] = final sum_x
    vhaddpd xmm5,xmm5,xmm5       ;xmm5[63:0] = final sum_y
    vhaddpd xmm6,xmm6,xmm6       ;xmm6[63:0] = final sum_xx
    vhaddpd xmm7,xmm7,xmm7       ;xmm7[63:0] = final sum_xy
; Compute denominator and make sure it's valid
; denom = n * sum_xx - sum_x * sum_x
    vmulsd xmm0,xmm3,xmm6        ;n * sum_xx
    vmulsd xmm1,xmm4,xmm4        ;sum_x * sum_x
    vsubsd xmm2,xmm0,xmm1        ;denom
    vandpd xmm8,xmm2,xmmword ptr [AbsMaskF64] ;fabs(denom)
    vcomisd xmm8,real8 ptr [LsEpsilon]
    jb BadDen              ;jump if fabs(denom) < LsEpsilon
; Compute and save slope
; slope = (n * sum_xy - sum_x * sum_y) / denom
    vmulsd xmm0,xmm3,xmm7        ;n * sum_xy
    vmulsd xmm1,xmm4,xmm5        ;sum_x * sum_y
    vsubsd xmm2,xmm0,xmm1        ;slope numerator
    vdivsd xmm3,xmm2,xmm8        ;final slope
    vmovsd real8 ptr [r9],xmm3     ;save slope
; Compute and save intercept
; intercept = (sum_xx * sum_y - sum_x * sum_xy) / denom
    vmulsd xmm0,xmm6,xmm5        ;sum_xx * sum_y
    vmulsd xmm1,xmm4,xmm7        ;sum_x * sum_xy
    vsubsd xmm2,xmm0,xmm1        ;intercept numerator
    vdivsd xmm3,xmm2,xmm8        ;final intercept
    vmovsd real8 ptr [r10],xmm3     ;save intercept
    mov eax,1              ;success return code
    jmp Done
; Bad denominator detected, set m and b to 0.0
BadDen: vxorpd xmm0,xmm0,xmm0
    vmovsd real8 ptr [r9],xmm0     ;*m = 0.0
    vmovsd real8 ptr [r10],xmm0     ;*b = 0.0
    xor eax,eax             ;set error code
Done:  _RestoreXmmRegs xmm6,xmm7,xmm8
    _DeleteFrame rbx
    ret
AvxCalcLeastSquares_ endp
    end
Listing 6-6. Example Ch06_06

Simple linear regression is a statistical technique that models a linear relationship between two variables. One popular method of simple linear regression is called least squares fitting, which uses a set of sample data points to determine a best fit or optimal curve between two variables. When used with a simple linear regression model, the curve is a straight line whose equation is y = mx + b. In this equation, x denotes the independent variable, y represents the dependent (or measured) variable, m is the line’s slope, and b is the line’s y-axis intercept point. The slope and intercept point of a least squares line are determined using a series of computations that minimize the sum of the squared deviations between the line and sample data points. Following calculation of its slope and intercept point, a least squares line is frequently used to predict an unknown y value using a known x value. If you’re interested in learning more about the theory of simple linear regression and least squares fitting, consult the references listed in Appendix A.

In sample program Ch06_06, the following equations are used to calculate the least squares slope and intercept point:
$$ m=\frac{n\sum\limits_i x_i y_i-\sum\limits_i x_i \sum\limits_i y_i}{n\sum\limits_i x_i^2-{\left(\sum\limits_i x_i\right)}^2} $$

$$ b=\frac{\sum\limits_i x_i^2 \sum\limits_i y_i-\sum\limits_i x_i \sum\limits_i x_i y_i}{n\sum\limits_i x_i^2-{\left(\sum\limits_i x_i\right)}^2} $$
At first glance, the slope and intercept equations may appear a little daunting. However, upon closer examination, a couple of simplifications become apparent. First, the slope and intercept point denominators are the same, which means that this value only needs to be computed once. Second, it is only necessary to calculate four simple summation quantities (or sum variables), as shown in the following equations:
$$ sum\_x=\sum\limits_i x_i $$
$$ sum\_y=\sum\limits_i y_i $$
$$ sum\_xy=\sum\limits_i x_i y_i $$
$$ sum\_xx=\sum\limits_i x_i^2 $$

Subsequent to the calculation of the sum variables, the least-squares slope and intercept point are easily derived using straightforward multiplication, subtraction, and division.
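
To make these computations concrete, here is a minimal scalar C++ sketch of the four sum variables and the slope/intercept equations. The function name and epsilon parameter are illustrative only and are not part of the example's source code.

#include <cmath>
#include <cstddef>

// Minimal scalar sketch of the least squares equations shown above.
// Returns false when the denominator is too close to zero to be valid.
bool CalcLeastSquaresScalar(const double* x, const double* y, size_t n,
                            double* m, double* b, double epsilon = 1.0e-12)
{
    double sum_x = 0.0, sum_y = 0.0, sum_xx = 0.0, sum_xy = 0.0;

    for (size_t i = 0; i < n; i++)
    {
        sum_x += x[i];              // sum of x values
        sum_y += y[i];              // sum of y values
        sum_xx += x[i] * x[i];      // sum of squared x values
        sum_xy += x[i] * y[i];      // sum of x * y products
    }

    double denom = n * sum_xx - sum_x * sum_x;

    if (std::fabs(denom) < epsilon)
    {
        *m = *b = 0.0;
        return false;
    }

    *m = (n * sum_xy - sum_x * sum_y) / denom;
    *b = (sum_xx * sum_y - sum_x * sum_xy) / denom;
    return true;
}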

The C++ source code in Listing 6-6 includes a function named AvxCalcLeastSquaresCpp that calculates a least-squares slope and intercept point for comparison purposes. AvxCalcLeastSquaresCpp uses AlignedMem::IsAligned() to validate proper alignment of the two data arrays. The C++ class AlignedMem (source code not shown but included in the download package) contains a few simple member functions that perform aligned memory management and validation. These functions have been incorporated into a C++ class to facilitate code reuse in this example and subsequent chapters. The C++ function main defines a couple of test arrays named x and y using the C++ specifier alignas(16), which instructs the compiler to align each of these arrays on a 16-byte boundary. The remainder of main contains code that exercises both the C++ and x86 assembly language implementations of the least squares algorithm and streams the results to cout.

The x86-64 assembly language code for function AvxCalcLeastSquares_ begins with saves of non-volatile registers RBX, XMM6, XMM7, and XMM8 using the macros _CreateFrame and _SaveXmmRegs. Argument value n is then validated for size, and the array pointers x and y are tested for proper alignment. Following validation of the function arguments, a series of initializations is performed. The vcvtsi2sd xmm3,xmm3,r8d instruction converts the value n to double-precision floating-point for later use. The value n in R8D is then rounded down to the nearest even number using an and r8d,0fffffffeh instruction and EAX is set to zero or one depending on whether the original value of n is even or odd. These adjustments are carried out to ensure proper processing of arrays x and y using packed arithmetic.

Recall from the discussions earlier in this section that in order to compute the slope and intercept point of a least squares regression line, you need to calculate four intermediate sum values: sum_x, sum_y, sum_xx, and sum_xy. The summation loop that calculates these values in AvxCalcLeastSquares_ uses packed double-precision floating-point arithmetic. This means that AvxCalcLeastSquares_ can process two elements from arrays x and y during each loop iteration, which halves the number of required iterations. The sum values for array elements with even-numbered indices are computed using the low-order quadwords of XMM4-XMM7, while the high-order quadwords are used to calculate the sum values for array elements with odd-numbered indices.

Prior to entering the summation loop, each sum value register is initialized to zero using a vxorpd instruction. At the top of the summation loop, a vmovapd xmm0,xmmword ptr [rcx+rbx] instruction copies x[i] and x[i+1] into the low-order and high-order quadwords of XMM0, respectively. The next instruction, vmovapd xmm1,xmmword ptr [rdx+rbx], loads y[i] and y[i+1] into the low-order and high-order quadwords of XMM1. A series of vaddpd and vmulpd instructions update the packed sum values that are maintained in XMM4 - XMM7. Array offset register RBX is then incremented by 16 (or the size of two double-precision floating-point values) and the count value in R8D is adjusted before the next summation loop iteration. Following completion of the summation loop, a check is made to determine if the original value of n was odd. If true, the final element of array x and array y must be added to the packed sum values. The AVX scalar instructions vaddsd and vmulsd carry out this operation.
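
The following C++ fragment restates the structure of this summation loop in scalar form; it is an illustrative sketch of the even/odd lane bookkeeping, not the AvxCalcLeastSquaresCpp function from Listing 6-6.

#include <cstddef>

// Illustrative restatement of the summation loop structure used by
// AvxCalcLeastSquares_: two partial sums (even and odd lanes) are
// maintained, then combined, with an optional scalar tail element.
void CalcSums(const double* x, const double* y, size_t n,
              double& sum_x, double& sum_y, double& sum_xx, double& sum_xy)
{
    double sx[2] = {}, sy[2] = {}, sxx[2] = {}, sxy[2] = {};
    size_t n_even = n & ~static_cast<size_t>(1);    // round n down to even

    for (size_t i = 0; i < n_even; i += 2)          // two elements per iteration
    {
        for (size_t lane = 0; lane < 2; lane++)     // models one vaddpd/vmulpd lane
        {
            sx[lane] += x[i + lane];
            sy[lane] += y[i + lane];
            sxx[lane] += x[i + lane] * x[i + lane];
            sxy[lane] += x[i + lane] * y[i + lane];
        }
    }

    if (n & 1)                                      // scalar tail when n is odd
    {
        sx[0] += x[n - 1];
        sy[0] += y[n - 1];
        sxx[0] += x[n - 1] * x[n - 1];
        sxy[0] += x[n - 1] * y[n - 1];
    }

    sum_x = sx[0] + sx[1];                          // models the vhaddpd reductions
    sum_y = sy[0] + sy[1];
    sum_xx = sxx[0] + sxx[1];
    sum_xy = sxy[0] + sxy[1];
}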

Following computation of the packed sum values, a series of vhaddpd (Packed Double-FP Horizontal Add) instructions compute the final values of sum_x, sum_y, sum_xx, and sum_xy. Each vhaddpd DesOp,SrcOp1,SrcOp2 instruction computes DesOp[63:0] = SrcOp1[127:64] + SrcOp1[63:0] and DesOp[127:64] = SrcOp2[127:64] + SrcOp2[63:0] (see Figure 4-14). Subsequent to the execution of the vhaddpd instructions, the low-order quadwords of registers XMM4–XMM7 contain the final sum values. The high-order quadwords of these registers also contain the final sum values, but this is a consequence of using the same register for both source operands. The value of denom is computed next and tested to make sure its absolute value is greater than or equal to LsEpsilon; an absolute value less than LsEpsilon is considered too close to zero to be valid. Note that a vandpd instruction is used to calculate fabs(denom). After validation of denom, the slope and intercept values are calculated using straightforward scalar arithmetic. Here is the output for source code example Ch06_06:
Results from AvxCalcLeastSquaresCpp
 rc:         true
 slope:    0.10324631
 intercept: -0.10700632
Results from AvxCalcLeastSquares_
 rc:         true
 slope:    0.10324631
 intercept: -0.10700632

Packed Floating-Point Matrices

Software applications such as computer graphics and computer-aided design programs often make extensive use of matrices. For example, three-dimensional (3D) computer graphics software typically employs matrices to perform common transformations such as translation, scaling, and rotation. When using homogeneous coordinates, each of these operations can be efficiently represented using a single 4 × 4 matrix. Multiple transformations can also be applied by merging a series of distinct transformation matrices into a single transformation matrix using matrix multiplication. This combined matrix is typically applied to an array of object vertices that defines a 3D model. It is important for 3D computer graphics software to carry out operations such as matrix multiplication and matrix-vector multiplication as quickly as possible since a 3D model may contain thousands or even millions of object vertices.

In this section, you learn how to perform matrix transposition and multiplication using 4 × 4 matrices and the AVX instruction set. You also learn more about assembly language macros, how to write macro code, and some simple techniques for benchmarking algorithm performance.

Matrix Transposition

The transpose of a matrix is calculated by interchanging its rows and columns. More formally, if A is an m × n matrix, the transpose of A (denoted here by B) is an n × m matrix, where b(i,j) = a(j,i). Figure 6-4 illustrates the transposition of a 4 × 4 matrix.
Figure 6-4. Transposition of a 4 × 4 matrix
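
For reference, a generic scalar transpose is only a few lines of C++; the sketch below illustrates the definition b(i,j) = a(j,i) and is not the Matrix<float>::Transpose member function used later in this example.

#include <cstddef>

// Generic scalar transpose: b(i, j) = a(j, i).
// Matrices are stored in row-major order; 'a' is m x n, 'b' is n x m.
void TransposeScalar(float* b, const float* a, size_t m, size_t n)
{
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < m; j++)
            b[i * m + j] = a[j * n + i];
}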

Listing 6-7 shows the source code for example Ch06_07, which demonstrates how to transpose a 4 × 4 matrix of single-precision floating-point values.
//------------------------------------------------
//        Ch06_07.h
//------------------------------------------------
#pragma once
// Ch06_07_.asm
extern "C" void AvxMat4x4TransposeF32_(float* m_des, const float* m_src);
// Ch06_07_BM.cpp
extern void AvxMat4x4TransposeF32_BM(void);
//------------------------------------------------
//        Ch06_07.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include "Ch06_07.h"
#include "Matrix.h"
using namespace std;
void AvxMat4x4TransposeF32(Matrix<float>& m_src)
{
  const size_t nr = 4;
  const size_t nc = 4;
  Matrix<float> m_des1(nr, nc);
  Matrix<float> m_des2(nr, nc);
  Matrix<float>::Transpose(m_des1, m_src);
  AvxMat4x4TransposeF32_(m_des2.Data(), m_src.Data());
  cout << fixed << setprecision(1);
  m_src.SetOstream(12, " ");
  m_des1.SetOstream(12, " ");
  m_des2.SetOstream(12, " ");
  cout << "Results for AvxMat4x4TransposeF32 ";
  cout << "Matrix m_src " << m_src << ' ';
  cout << "Matrix m_des1 " << m_des1 << ' ';
  cout << "Matrix m_des2 " << m_des2 << ' ';
  if (m_des1 != m_des2)
    cout << " Matrix compare failed - AvxMat4x4TransposeF32 ";
}
int main()
{
  const size_t nr = 4;
  const size_t nc = 4;
  Matrix<float> m_src(nr, nc);
  const float src_row0[] = { 2, 7, 8, 3 };
  const float src_row1[] = { 11, 14, 16, 10 };
  const float src_row2[] = { 24, 21, 27, 29 };
  const float src_row3[] = { 31, 34, 38, 33 };
  m_src.SetRow(0, src_row0);
  m_src.SetRow(1, src_row1);
  m_src.SetRow(2, src_row2);
  m_src.SetRow(3, src_row3);
  AvxMat4x4TransposeF32(m_src);
  AvxMat4x4TransposeF32_BM();
  return 0;
}
;-------------------------------------------------
;        Ch06_07.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
; _Mat4x4TransposeF32 macro
;
; Description: This macro transposes a 4x4 matrix of single-precision
;        floating-point values.
;
; Input Matrix          Output Matrix
; ---------------------------------------------------
; xmm0  a3 a2 a1 a0       xmm4  d0 c0 b0 a0
; xmm1  b3 b2 b1 b0       xmm5  d1 c1 b1 a1
; xmm2  c3 c2 c1 c0       xmm6  d2 c2 b2 a2
; xmm3  d3 d2 d1 d0       xmm7  d3 c3 b3 a3
_Mat4x4TransposeF32 macro
    vunpcklps xmm6,xmm0,xmm1      ;xmm6 = b1 a1 b0 a0
    vunpckhps xmm0,xmm0,xmm1      ;xmm0 = b3 a3 b2 a2
    vunpcklps xmm7,xmm2,xmm3      ;xmm7 = d1 c1 d0 c0
    vunpckhps xmm1,xmm2,xmm3      ;xmm1 = d3 c3 d2 c2
    vmovlhps xmm4,xmm6,xmm7       ;xmm4 = d0 c0 b0 a0
    vmovhlps xmm5,xmm7,xmm6       ;xmm5 = d1 c1 b1 a1
    vmovlhps xmm6,xmm0,xmm1       ;xmm6 = d2 c2 b2 a2
    vmovhlps xmm7,xmm1,xmm0       ;xmm7 = d3 c3 b3 a3
    endm
; extern "C" void AvxMat4x4TransposeF32_(float* m_des, const float* m_src)
    .code
AvxMat4x4TransposeF32_ proc frame
    _CreateFrame MT_,0,32
    _SaveXmmRegs xmm6,xmm7
    _EndProlog
; Transpose matrix m_src1
    vmovaps xmm0,[rdx]         ;xmm0 = m_src.row_0
    vmovaps xmm1,[rdx+16]        ;xmm1 = m_src.row_1
    vmovaps xmm2,[rdx+32]        ;xmm2 = m_src.row_2
    vmovaps xmm3,[rdx+48]        ;xmm3 = m_src.row_3
    _Mat4x4TransposeF32
    vmovaps [rcx],xmm4         ;save m_des.row_0
    vmovaps [rcx+16],xmm5        ;save m_des.row_1
    vmovaps [rcx+32],xmm6        ;save m_des.row_2
    vmovaps [rcx+48],xmm7        ;save m_des.row_3
Done:  _RestoreXmmRegs xmm6,xmm7
    _DeleteFrame
    ret
AvxMat4x4TransposeF32_ endp
    end
//------------------------------------------------
//        Ch06_07_BM.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <string>
#include "Ch06_07.h"
#include "Matrix.h"
#include "BmThreadTimer.h"
#include "OS.h"
using namespace std;
extern void AvxMat4x4TransposeF32_BM(void)
{
  OS::SetThreadAffinityMask();
  cout << " Running benchmark function AvxMat4x4TransposeF32_BM - please wait ";
  const size_t num_rows = 4;
  const size_t num_cols = 4;
  Matrix<float> m_src(num_rows, num_cols);
  Matrix<float> m_des1(num_rows, num_cols);
  Matrix<float> m_des2(num_rows, num_cols);
  const float m_src_r0[] = { 10, 11, 12, 13 };
  const float m_src_r1[] = { 14, 15, 16, 17 };
  const float m_src_r2[] = { 18, 19, 20, 21 };
  const float m_src_r3[] = { 22, 23, 24, 25 };
  m_src.SetRow(0, m_src_r0);
  m_src.SetRow(1, m_src_r1);
  m_src.SetRow(2, m_src_r2);
  m_src.SetRow(3, m_src_r3);
  const size_t num_it = 500;
  const size_t num_alg = 2;
  const size_t num_ops = 1000000;
  BmThreadTimer bmtt(num_it, num_alg);
  for (size_t i = 0; i < num_it; i++)
  {
    bmtt.Start(i, 0);
    for (size_t j = 0; j < num_ops; j++)
      Matrix<float>::Transpose(m_des1, m_src);
    bmtt.Stop(i, 0);
    bmtt.Start(i, 1);
    for (size_t j = 0; j < num_ops; j++)
      AvxMat4x4TransposeF32_(m_des2.Data(), m_src.Data());
    bmtt.Stop(i, 1);
  }
  string fn = bmtt.BuildCsvFilenameString("Ch06_07_AvxMat4x4TransposeF32_BM");
  bmtt.SaveElapsedTimes(fn, BmThreadTimer::EtUnit::MicroSec, 2);
  cout << "Benchmark times save to file " << fn << ' ';
}
Listing 6-7.

Example Ch06_07

The function main begins by instantiating a 4 × 4 single-precision floating-point test matrix named m_src using the C++ template Matrix. This template, which is defined in the header file Matrix.h (source code not shown), contains C++ code that implements a simple matrix class for test and benchmarking purposes. The internal buffer allocated by Matrix is aligned on a 64-byte boundary, which means that objects of type Matrix are properly aligned for use with AVX, AVX2, and AVX-512 instructions. The function main calls AvxMat4x4TransposeF32, which exercises the matrix transposition functions written in C++ and assembly language. The results of these transpositions are then streamed to cout. The function main also invokes a benchmarking function named AvxMat4x4TransposeF32_BM that measures the performance of each transposition function as explained later in this section.
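
The allocation details of Matrix are not shown, but a 64-byte aligned buffer can be obtained in standard C++17 as sketched below; the helper function names are illustrative only, and Matrix.h may use a different technique.

#include <cstddef>
#include <new>

// Illustrative sketch: allocate and release a 64-byte aligned float buffer
// using C++17 aligned operator new.
float* AllocAligned64(std::size_t count)
{
    return static_cast<float*>(::operator new(count * sizeof(float), std::align_val_t{64}));
}

void FreeAligned64(float* p)
{
    ::operator delete(p, std::align_val_t{64});
}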

Near the top of assembly language code is a macro named _Mat4x4TransposeF32. You learned in Chapter 5 that a macro is an assembler text substitution mechanism that allows a single text string to represent a sequence of assembly language instructions, data definitions, or other statements. During assembly of an x86 assembly language source code file, the assembler replaces any occurrence of the macro name with the statements that are declared between the macro and endm directives. Assembly language macros are typically employed to generate sequences of instructions that will be used more than once. Macros are also frequently used to avoid the performance overhead of a function call.

The macro _Mat4x4TransposeF32 contains AVX instructions that transpose a 4 × 4 matrix of single-precision floating-point values. This macro requires the rows of the source matrix to be loaded into registers XMM0 – XMM3 prior to its use. It then employs a series of vunpcklps, vunpckhps, vmovlhps, and vmovhlps instructions to transpose the source matrix, as illustrated in Figure 6-5. Following execution of these instructions, the transposed matrix is stored in registers XMM4–XMM7.
Figure 6-5. Instruction sequence used by _Mat4x4TransposeF32 to transpose a 4 × 4 matrix of single-precision floating-point values
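
The same shuffle sequence can also be written with C++ SSE intrinsics, which may help clarify what each unpack and move instruction contributes; this is an illustrative alternative, not code from the chapter's download package.

#include <immintrin.h>

// Intrinsics counterpart of _Mat4x4TransposeF32 (illustrative only).
// Comments list elements from the high lane to the low lane, matching
// the register diagrams in the macro header.
void Transpose4x4F32(__m128& r0, __m128& r1, __m128& r2, __m128& r3)
{
    __m128 t0 = _mm_unpacklo_ps(r0, r1);    // b1 a1 b0 a0
    __m128 t1 = _mm_unpackhi_ps(r0, r1);    // b3 a3 b2 a2
    __m128 t2 = _mm_unpacklo_ps(r2, r3);    // d1 c1 d0 c0
    __m128 t3 = _mm_unpackhi_ps(r2, r3);    // d3 c3 d2 c2

    r0 = _mm_movelh_ps(t0, t2);             // d0 c0 b0 a0
    r1 = _mm_movehl_ps(t2, t0);             // d1 c1 b1 a1
    r2 = _mm_movelh_ps(t1, t3);             // d2 c2 b2 a2
    r3 = _mm_movehl_ps(t3, t1);             // d3 c3 b3 a3
}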

The macro _Mat4x4TransposeF32 is used by the assembly language function AvxMat4x4TransposeF32_. Immediately following its function prolog, function AvxMat4x4TransposeF32_ executes a series of vmovaps instructions to load the source matrix into registers XMM0–XMM3. Each XMM register contains one row of the source matrix. The macro _Mat4x4TransposeF32 is then employed to transpose the matrix. Figure 6-6 contains an excerpt from the MASM listing file that shows the macro expansion of _Mat4x4TransposeF32. This figure also shows the expansions of the prolog and epilog macros. The listing file identifies macro-expanded instructions by placing a 1 in a column that’s located to the left of the mnemonic. Following calculation of the transpose, the resultant matrix is saved to the destination buffer using another series of vmovaps instructions.
Figure 6-6. Expansion of macro _Mat4x4TransposeF32

Here is the output for source code example Ch06_07:
Results for AvxMat4x4TransposeF32
Matrix m_src
     2.0      7.0      8.0      3.0
    11.0     14.0     16.0     10.0
    24.0     21.0     27.0     29.0
    31.0     34.0     38.0     33.0
Matrix m_des1
     2.0     11.0     24.0     31.0
     7.0     14.0     21.0     34.0
     8.0     16.0     27.0     38.0
     3.0     10.0     29.0     33.0
Matrix m_des2
     2.0     11.0     24.0     31.0
     7.0     14.0     21.0     34.0
     8.0     16.0     27.0     38.0
     3.0     10.0     29.0     33.0
Running benchmark function AvxMat4x4TransposeF32_BM - please wait
Benchmark times saved to file Ch06_07_AvxMat4x4TransposeF32_BM_CHROMIUM.csv

Source code example Ch06_07 includes a function named AvxMat4x4TransposeF32_BM that contains code for measuring execution times of the C++ and assembly language matrix transposition functions. Most of the timing measurement code is encapsulated in a C++ class named BmThreadTimer. This class includes two member functions, BmThreadTimer::Start and BmThreadTimer::Stop, that implement a simple software stopwatch. Class BmThreadTimer also includes a member function named BmThreadTimer::SaveElapsedTimes, which saves the timing measurements to a comma-separated text file. AvxMat4x4TransposeF32_BM also uses a C++ class named OS. This class includes member functions that manage process and thread affinity. In the current example, OS::SetThreadAffinityMask selects a specific processor for benchmark thread execution. Doing this improves the accuracy of the timing measurements. The source code for classes BmThreadTimer and OS is not shown in Listing 6-7, but is included as part of the chapter download package.
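
Although the BmThreadTimer source isn't listed here, its stopwatch behavior can be approximated with std::chrono as shown in this sketch; the class name and interface below are assumptions for illustration, not the actual class.

#include <chrono>

// Minimal stopwatch sketch approximating BmThreadTimer's Start/Stop usage
// (the real class stores one elapsed time per iteration and algorithm).
class SimpleStopwatch
{
public:
    void Start() { m_start = std::chrono::steady_clock::now(); }

    // Returns the elapsed time in microseconds since the last Start().
    long long Stop()
    {
        auto stop = std::chrono::steady_clock::now();
        return std::chrono::duration_cast<std::chrono::microseconds>(stop - m_start).count();
    }

private:
    std::chrono::steady_clock::time_point m_start;
};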

Table 6-1 contains matrix transposition timing measurements using several different Intel processors. The measurements were made using an EXE file that was built with the Visual C++ Release configuration and the default settings for code optimization, except for the following options: AVX code generation (/arch:AVX) was selected to facilitate “apples-to-apples” comparisons between the C++ and x86-64 assembly language code (the default code generation option for 64-bit Visual C++ is SSE2), and whole program optimization was disabled. All timing measurements were made using ordinary desktop PCs running Windows 10. No attempt was made to account for any hardware, software, operating system, or configuration differences between the PCs prior to running the benchmark executable file. The test conditions described in this section are also used in subsequent chapters.
Table 6-1. Matrix Transposition Mean Execution Times (Microseconds), 1,000,000 Transpositions

CPU                    C++      Assembly Language
Intel Core i7-4790S    15885    2575
Intel Core i9-7900X    13381    2203
Intel Core i7-8700K    12216    1825

The values shown in Table 6-1 were computed using the CSV file execution times and the Excel spreadsheet function TRIMMEAN(array, 0.10). The assembly language implementation of the matrix transposition algorithm clearly outperforms the C++ version by a wide margin. It is not uncommon to achieve significant speed improvements using x86 assembly language, especially for algorithms that can exploit the SIMD parallelism of an x86 processor. You’ll see additional examples of accelerated algorithmic performance throughout the remainder of this book.
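
For readers who prefer to post-process the CSV files programmatically, a 10 percent trimmed mean similar to TRIMMEAN(array, 0.10) can be computed with a short C++ function; this sketch is not part of the example's source code and assumes a non-empty sample set.

#include <algorithm>
#include <cstddef>
#include <vector>

// Trimmed mean: sorts the samples, discards 'fraction' of the total count
// (split evenly between the low and high ends), and averages the remainder.
double TrimmedMean(std::vector<double> samples, double fraction)
{
    std::sort(samples.begin(), samples.end());
    size_t discard = static_cast<size_t>(samples.size() * fraction / 2.0);

    double sum = 0.0;
    size_t count = samples.size() - 2 * discard;

    for (size_t i = discard; i < samples.size() - discard; i++)
        sum += samples[i];

    return sum / count;
}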

The benchmark timing measurements cited in this book provide reasonable approximations of function execution times. Like automobile fuel economy and battery runtime estimates, software performance benchmarking is not an exact science and is subject to a variety of pitfalls. It is also important to keep in mind that this book is an introductory primer about x86-64 assembly language programming, not benchmarking. The source code examples are structured to hasten the study of a new programming language, not to maximize performance. In addition, the Visual C++ options described earlier were selected mostly for practical reasons and may not yield optimal performance in all cases. Like many high-level compilers, Visual C++ includes a plethora of code generation and speed options that can affect performance. Benchmark timing measurements should always be construed in a context that’s correlated with the software’s purpose. The methods described in this section are generally worthwhile, but results can vary.

Matrix Multiplication

The product of two matrices is defined as follows. Let A be an m × n matrix where m and n denote the number of rows and columns, respectively. Let B be an n × p matrix. Let C be the product of A and B, which is an m × p matrix. The value of each element c(i, j) in C can be calculated using the following equation:
$$ c_{ij}=\sum\limits_{k=0}^{n-1}a_{ik}b_{kj}\qquad i=0,\dots,m-1;\; j=0,\dots,p-1 $$

Before proceeding to the sample code, a few comments are warranted. According to the definition of matrix multiplication, the number of columns in A must equal the number of rows in B. For example, if A is a 3 × 4 matrix and B is a 4 × 2 matrix, the product AB (a 3 × 2 matrix) can be calculated but the product BA is undefined. Note that the value of each c(i, j) in C is simply the dot product of row i in matrix A and column j in matrix B. The assembly language code will exploit this fact to perform matrix multiplications using packed AVX instructions. Also note that unlike most mathematical texts, the subscripts in the matrix multiplication equation use zero-based indexing. This simplifies translating the equation into C++ and assembly language code.
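
The equation maps directly onto the familiar triple-loop implementation; the following scalar C++ sketch is a generic illustration of the definition, not the Matrix<T>::Mul template member function.

#include <cstddef>

// Scalar matrix multiply: C (m x p) = A (m x n) * B (n x p).
// All matrices are stored in row-major order with zero-based indexing.
void MatMulScalar(float* c, const float* a, const float* b,
                  size_t m, size_t n, size_t p)
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < p; j++)
        {
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++)
                sum += a[i * n + k] * b[k * p + j];   // dot product of row i and column j
            c[i * p + j] = sum;
        }
    }
}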

Listing 6-8 shows the source code for example Ch06_08. This example demonstrates how to perform matrix multiplication using two 4 × 4 matrices of single-precision floating-point values. Similar to the previous example, main calls a function named AvxMat4x4MulF32 that exercises a matrix multiplication test case using functions written in C++ and assembly language. The template member function Matrix<float>::Mul (source code not shown) carries out C++ matrix multiplication using the previously described equation. The assembly language function AvxMat4x4MulF32_ uses SIMD arithmetic to perform matrix multiplication as you’ll soon see.
//------------------------------------------------
//        Ch06_08.h
//------------------------------------------------
#pragma once
// Ch06_08_.asm
extern "C" void AvxMat4x4MulF32_(float* m_des, const float* m_src1, const float* m_src2);
// Ch06_08_BM.cpp
extern void AvxMat4x4MulF32_BM(void);
//------------------------------------------------
//        Ch06_08.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include "Ch06_08.h"
#include "Matrix.h"
using namespace std;
void AvxMat4x4MulF32(Matrix<float>& m_src1, Matrix<float>& m_src2)
{
  const size_t nr = 4;
  const size_t nc = 4;
  Matrix<float> m_des1(nr, nc);
  Matrix<float> m_des2(nr, nc);
  Matrix<float>::Mul(m_des1, m_src1, m_src2);
  AvxMat4x4MulF32_(m_des2.Data(), m_src1.Data(), m_src2.Data());
  cout << fixed << setprecision(1);
  m_src1.SetOstream(12, " ");
  m_src2.SetOstream(12, " ");
  m_des1.SetOstream(12, " ");
  m_des2.SetOstream(12, " ");
  cout << " Results for AvxMat4x4MulF32 ";
  cout << "Matrix m_src1 " << m_src1 << ' ';
  cout << "Matrix m_src2 " << m_src2 << ' ';
  cout << "Matrix m_des1 " << m_des1 << ' ';
  cout << "Matrix m_des2 " << m_des2 << ' ';
  if (m_des1 != m_des2)
    cout << " Matrix compare failed - AvxMat4x4MulF32 ";
}
int main()
{
  const size_t nr = 4;
  const size_t nc = 4;
  Matrix<float> m_src1(nr, nc);
  Matrix<float> m_src2(nr, nc);
  const float src1_row0[] = { 10, 11, 12, 13 };
  const float src1_row1[] = { 20, 21, 22, 23 };
  const float src1_row2[] = { 30, 31, 32, 33 };
  const float src1_row3[] = { 40, 41, 42, 43 };
  const float src2_row0[] = { 100, 101, 102, 103 };
  const float src2_row1[] = { 200, 201, 202, 203 };
  const float src2_row2[] = { 300, 301, 302, 303 };
  const float src2_row3[] = { 400, 401, 402, 403 };
  m_src1.SetRow(0, src1_row0);
  m_src1.SetRow(1, src1_row1);
  m_src1.SetRow(2, src1_row2);
  m_src1.SetRow(3, src1_row3);
  m_src2.SetRow(0, src2_row0);
  m_src2.SetRow(1, src2_row1);
  m_src2.SetRow(2, src2_row2);
  m_src2.SetRow(3, src2_row3);
  AvxMat4x4MulF32(m_src1, m_src2);
  AvxMat4x4MulF32_BM();
  return 0;
}
;-------------------------------------------------
;        Ch06_08.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
; _Mat4x4MulCalcRowF32 macro
;
; Description: This macro is used to compute one row of a 4x4 matrix
;        multiply.
;
; Registers:  xmm0 = m_src2.row0
;        xmm1 = m_src2.row1
;        xmm2 = m_src2.row2
;        xmm3 = m_src2.row3
;        rcx = m_des ptr
;        rdx = m_src1 ptr
;        xmm4 - xmm7 = scratch registers
_Mat4x4MulCalcRowF32 macro disp
    vbroadcastss xmm4,real4 ptr [rdx+disp]   ;broadcast m_src1[i][0]
    vbroadcastss xmm5,real4 ptr [rdx+disp+4]  ;broadcast m_src1[i][1]
    vbroadcastss xmm6,real4 ptr [rdx+disp+8]  ;broadcast m_src1[i][2]
    vbroadcastss xmm7,real4 ptr [rdx+disp+12]  ;broadcast m_src1[i][3]
    vmulps xmm4,xmm4,xmm0            ;m_src1[i][0] * m_src2.row_0
    vmulps xmm5,xmm5,xmm1            ;m_src1[i][1] * m_src2.row_1
    vmulps xmm6,xmm6,xmm2            ;m_src1[i][2] * m_src2.row_2
    vmulps xmm7,xmm7,xmm3            ;m_src1[i][3] * m_src2.row_3
    vaddps xmm4,xmm4,xmm5            ;calc m_des.row_i
    vaddps xmm6,xmm6,xmm7
    vaddps xmm4,xmm4,xmm6
    vmovaps [rcx+disp],xmm4           ;save m_des.row_i
    endm
; extern "C" void AvxMat4x4MulF32_(float* m_des, const float* m_src1, const float* m_src2)
;
; Description: The following function computes the product of two
;        single-precision floating-point 4x4 matrices.
    .code
AvxMat4x4MulF32_ proc frame
    _CreateFrame MM_,0,32
    _SaveXmmRegs xmm6,xmm7
    _EndProlog
; Compute matrix product m_des = m_src1 * m_src2
    vmovaps xmm0,[r8]          ;xmm0 = m_src2.row_0
    vmovaps xmm1,[r8+16]        ;xmm1 = m_src2.row_1
    vmovaps xmm2,[r8+32]        ;xmm2 = m_src2.row_2
    vmovaps xmm3,[r8+48]        ;xmm3 = m_src2.row_3
    _Mat4x4MulCalcRowF32 0       ;calculate m_des.row_0
    _Mat4x4MulCalcRowF32 16       ;calculate m_des.row_1
    _Mat4x4MulCalcRowF32 32       ;calculate m_des.row_2
    _Mat4x4MulCalcRowF32 48       ;calculate m_des.row_3
Done:  _RestoreXmmRegs xmm6,xmm7
    _DeleteFrame
    ret
AvxMat4x4MulF32_ endp
    end
//------------------------------------------------
//        Ch06_08_BM.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include "Ch06_08.h"
#include "Matrix.h"
#include "BmThreadTimer.h"
#include "OS.h"
using namespace std;
void AvxMat4x4MulF32_BM(void)
{
  OS::SetThreadAffinityMask();
  cout << " Running benchmark function AvxMat4x4MulF32_BM - please wait ";
  const size_t num_rows = 4;
  const size_t num_cols = 4;
  Matrix<float> m_src1(num_rows, num_cols);
  Matrix<float> m_src2(num_rows, num_cols);
  Matrix<float> m_des1(num_rows, num_cols);
  Matrix<float> m_des2(num_rows, num_cols);
  const float m_src1_r0[] = { 10, 11, 12, 13 };
  const float m_src1_r1[] = { 14, 15, 16, 17 };
  const float m_src1_r2[] = { 18, 19, 20, 21 };
  const float m_src1_r3[] = { 22, 23, 24, 25 };
  const float m_src2_r0[] = { 0, 1, 2, 3 };
  const float m_src2_r1[] = { 4, 5, 6, 7 };
  const float m_src2_r2[] = { 8, 9, 10, 11 };
  const float m_src2_r3[] = { 12, 13, 14, 15 };
  m_src1.SetRow(0, m_src1_r0);
  m_src1.SetRow(1, m_src1_r1);
  m_src1.SetRow(2, m_src1_r2);
  m_src1.SetRow(3, m_src1_r3);
  m_src2.SetRow(0, m_src2_r0);
  m_src2.SetRow(1, m_src2_r1);
  m_src2.SetRow(2, m_src2_r2);
  m_src2.SetRow(3, m_src2_r3);
  const size_t num_it = 500;
  const size_t num_alg = 2;
  const size_t num_ops = 1000000;
  BmThreadTimer bmtt(num_it, num_alg);
  for (size_t i = 0; i < num_it; i++)
  {
    bmtt.Start(i, 0);
    for (size_t j = 0; j < num_ops; j++)
      Matrix<float>::Mul(m_des1, m_src1, m_src2);
    bmtt.Stop(i, 0);
    bmtt.Start(i, 1);
    for (size_t j = 0; j < num_ops; j++)
      AvxMat4x4MulF32_(m_des2.Data(), m_src1.Data(), m_src2.Data());
    bmtt.Stop(i, 1);
  }
  string fn = bmtt.BuildCsvFilenameString("Ch06_08_AvxMat4x4MulF32_BM");
  bmtt.SaveElapsedTimes(fn, BmThreadTimer::EtUnit::MicroSec, 2);
  cout << "Benchmark times save to file " << fn << ' ';
}
Listing 6-8.

Example Ch06_08

The standard technique for performing matrix multiplication requires three nested for loops that employ scalar floating-point multiplication and addition (see the code for Matrix<T>::Mul in the header file Matrix.h). Figure 6-7 shows the explicit equations that can be used to calculate the elements of row 0 for the matrix product C = AB. Note that each row of matrix B is multiplied by the same element from matrix A. Similar sets of equations can be used to calculate rows 1, 2, and 3 of matrix C. The assembly language code in function AvxMat4x4MulF32_ uses these equations to carry out matrix multiplication using SIMD arithmetic.
Figure 6-7. Equations for first row of matrix C = AB
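
Restated in scalar C++, each row of C is a sum of the rows of B scaled by the elements of the corresponding row of A; the sketch below mirrors the equations in Figure 6-7 and is illustrative only, not code from the listing.

#include <cstddef>

// Computes one row of a 4x4 product C = A * B using the row-combination
// form shown in Figure 6-7: C.row_i = a[i][0]*B.row_0 + a[i][1]*B.row_1 +
// a[i][2]*B.row_2 + a[i][3]*B.row_3 (row-major storage).
void Mat4x4MulCalcRow(float* c, const float* a, const float* b, size_t i)
{
    for (size_t j = 0; j < 4; j++)
    {
        c[i * 4 + j] = a[i * 4 + 0] * b[0 * 4 + j]
                     + a[i * 4 + 1] * b[1 * 4 + j]
                     + a[i * 4 + 2] * b[2 * 4 + j]
                     + a[i * 4 + 3] * b[3 * 4 + j];
    }
}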

Following its prolog, AvxMat4x4MulF32_ loads matrix m_src2 (or B) into registers XMM0–XMM3. The next four lines use the macro _Mat4x4MulCalcRowF32 to calculate the products for rows 0–3 of m_des (or C). This macro implements the four equations that are shown in Figure 6-7. The macro parameter disp specifies which row to use. Macro _Mat4x4MulCalcRowF32 uses four vbroadcastss instructions to load the required elements from matrix m_src1 (or A) into registers XMM4–XMM7. It then uses four vmulps instructions to multiply these values by an entire row from matrix m_src2. A series of vaddps instructions computes the final element values for the row. The vmovaps [rcx+disp],xmm4 instruction saves the entire row to the specified destination buffer. Here is the output for example Ch06_08:
Results for AvxMat4x4MulF32
Matrix m_src1
    10.0     11.0     12.0     13.0
    20.0     21.0     22.0     23.0
    30.0     31.0     32.0     33.0
    40.0     41.0     42.0     43.0
Matrix m_src2
    100.0     101.0     102.0     103.0
    200.0     201.0     202.0     203.0
    300.0     301.0     302.0     303.0
    400.0     401.0     402.0     403.0
Matrix m_des1
   12000.0    12046.0    12092.0    12138.0
   22000.0    22086.0    22172.0    22258.0
   32000.0    32126.0    32252.0    32378.0
   42000.0    42166.0    42332.0    42498.0
Matrix m_des2
   12000.0    12046.0    12092.0    12138.0
   22000.0    22086.0    22172.0    22258.0
   32000.0    32126.0    32252.0    32378.0
   42000.0    42166.0    42332.0    42498.0
Running benchmark function AvxMat4x4MulF32_BM - please wait
Benchmark times saved to file Ch06_08_AvxMat4x4MulF32_BM_CHROMIUM.csv
Source code example Ch06_08 also includes a function named AvxMat4x4MulF32_BM that performs benchmark timing measurements of the matrix multiplication functions. Table 6-2 shows the timing measurements for several different Intel processors. These measurements were made using the procedure described in the previous section.
Table 6-2. Matrix Multiplication Mean Execution Times (Microseconds), 1,000,000 Multiplications

CPU                    C++      Assembly Language
Intel Core i7-4790S    55195    5333
Intel Core i9-7900X    46008    4897
Intel Core i7-8700K    42260    4493

Summary

Here are the key learning points for Chapter 6:
  • The vaddp[d|s], vsubp[d|s], vmulp[d|s], vdivp[d|s], and vsqrtp[d|s] instructions carry out common arithmetic operations using packed double-precision and packed single-precision floating-point operands.

  • The vcvtp[d|s]2dq and vcvtdq2p[d|s] instructions perform conversions between packed floating-point and packed signed-doubleword operands. The vcvtps2pd and vcvtpd2ps instructions perform conversions between packed single-precision and packed double-precision operands.

  • The vminp[d|s] and vmaxp[d|s] instructions perform packed minimum and maximum value calculations using double-precision and single-precision floating-point operands.

  • The vbroadcasts[d|s] instructions broadcast (or copy) a single scalar double-precision or single-precision value to all element positions of an x86 SIMD register.

  • Assembly language functions that use the vmovap[d|s] and vmovdqa instructions can only be used with operands in memory that are properly aligned. The MASM align 16 directive aligns data items in a .const or .data section to a 16-byte boundary. C++ functions can use the alignas specifier to guarantee proper alignment.

  • Assembly language functions can use the vunpck[h|l]p[d|s] instructions to accelerate common matrix operations, especially 4 × 4 matrices.

  • Assembly language functions can use the vhaddp[d|s] and vshufp[d|s] instructions to perform data reductions of intermediate packed values.

  • Many algorithms can achieve significant performance gains by using SIMD programming techniques and the x86-AVX instruction set.
