© The Author(s), under exclusive license to APress Media, LLC, part of Springer Nature 2022
D. KusswurmModern Parallel Programming with C++ and Assembly Languagehttps://doi.org/10.1007/978-1-4842-7918-2_2

2. AVX C++ Programming: Part 1

Daniel Kusswurm1  
(1)
Geneva, IL, USA
 

The previous chapter explained SIMD fundamentals including packed types, arithmetic calculations, and data manipulation operations. It also highlighted a few details regarding the history of x86-AVX and its computational capabilities. The focus of this chapter is AVX integer arithmetic using 128-bit wide operands. The first section contains several concise source code examples that illustrate how to use C++ SIMD intrinsic functions to perform packed integer arithmetic. This is followed by a section that highlights common programming operations with packed integers including bitwise logical operations and shifts. The third and final section includes source code examples that demonstrate elementary image processing tasks using C++ SIMD intrinsic functions. As you will soon see, SIMD techniques are ideal for many types of image processing algorithms.

Integer Arithmetic

In this section, you will learn the basics of x86-AVX packed integer arithmetic using 128-bit wide SIMD operands. It begins with a simple program that demonstrates packed integer addition using both wraparound and saturated arithmetic. This is followed by a similar program that focuses on packed integer subtraction. The final source code example of this section details packed integer multiplication.

Note

Most of the source code examples in this book are shown using a single listing. This is done to minimize the number of listing references in the main text. The actual source code is partitioned into separate files using the naming conventions described in Chapter 1.

Integer Addition

The first source code example is named Ch02_01. This example explains how to perform packed integer addition using C++ SIMD intrinsic functions. It also illustrates proper use of commonly used C++ SIMD data types. Listing 2-1 shows the source code for example Ch02_01.
//------------------------------------------------
//               XmmVal.h
//------------------------------------------------
#pragma once
#include <string>
#include <cstdint>
#include <sstream>
#include <iomanip>
struct alignas(16) XmmVal
{
public:
    union
    {
        int8_t m_I8[16];
        int16_t m_I16[8];
        int32_t m_I32[4];
        int64_t m_I64[2];
        uint8_t m_U8[16];
        uint16_t m_U16[8];
        uint32_t m_U32[4];
        uint64_t m_U64[2];
        float m_F32[4];
        double m_F64[2];
    };
    // rest of file XmmVal.h ...
//------------------------------------------------
//               Ch02_01.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch02_01_fcpp.cpp
extern void AddI16_Iavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
extern void AddU16_Iavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b);
//------------------------------------------------
//               Ch02_01.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_01.h"
static void AddI16(void);
static void AddU16(void);
int main()
{
    AddI16();
    AddU16();
    return 0;
}
static void AddI16(void)
{
    const char nl = '\n';
    XmmVal a, b, c1, c2;
    a.m_I16[0] = 10;          b.m_I16[0] = 100;
    a.m_I16[1] = 200;         b.m_I16[1] = -200;
    a.m_I16[2] = 30;          b.m_I16[2] = 32760;
    a.m_I16[3] = -32766;      b.m_I16[3] = -400;
    a.m_I16[4] = 50;          b.m_I16[4] = 500;
    a.m_I16[5] = 60;          b.m_I16[5] = -600;
    a.m_I16[6] = 32000;       b.m_I16[6] = 1200;
    a.m_I16[7] = -32000;      b.m_I16[7] = -950;
    AddI16_Iavx(&c1, &c2, &a, &b);
    std::cout << " Results for AddI16_Iavx - Wraparound Addition ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c1: " << c1.ToStringI16() << nl;
    std::cout << " Results for AddI16_Iavx - Saturated Addition ";
    std::cout << "a:  " << a.ToStringI16() << nl;
    std::cout << "b:  " << b.ToStringI16() << nl;
    std::cout << "c2: " << c2.ToStringI16() << nl;
}
static void AddU16(void)
{
    const char nl = '\n';
    XmmVal a, b, c1, c2;
    a.m_U16[0] = 10;          b.m_U16[0] = 100;
    a.m_U16[1] = 200;         b.m_U16[1] = 200;
    a.m_U16[2] = 300;         b.m_U16[2] = 65530;
    a.m_U16[3] = 32766;       b.m_U16[3] = 40000;
    a.m_U16[4] = 50;          b.m_U16[4] = 500;
    a.m_U16[5] = 20000;       b.m_U16[5] = 25000;
    a.m_U16[6] = 32000;       b.m_U16[6] = 1200;
    a.m_U16[7] = 32000;       b.m_U16[7] = 50000;
    AddU16_Iavx(&c1, &c2, &a, &b);
    std::cout << " Results for AddU16_Iavx - Wraparound Addition ";
    std::cout << "a:  " << a.ToStringU16() << nl;
    std::cout << "b:  " << b.ToStringU16() << nl;
    std::cout << "c1: " << c1.ToStringU16() << nl;
    std::cout << " Results for AddU16_Iavx - Saturated Addition ";
    std::cout << "a:  " << a.ToStringU16() << nl;
    std::cout << "b:  " << b.ToStringU16() << nl;
    std::cout << "c2: " << c2.ToStringU16() << nl;
}
//------------------------------------------------
//               Ch02_01_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_01.h"
void AddI16_Iavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c1_vals = _mm_add_epi16(a_vals, b_vals);
    __m128i c2_vals = _mm_adds_epi16 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c1, c1_vals);
    _mm_store_si128 ((__m128i*)c2, c2_vals);
}
void AddU16_Iavx(XmmVal* c1, XmmVal* c2, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c1_vals = _mm_add_epi16 (a_vals, b_vals);
    __m128i c2_vals = _mm_adds_epu16 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c1, c1_vals);
    _mm_store_si128 ((__m128i*)c2, c2_vals);
}
Listing 2-1

Example Ch02_01

Listing 2-1 begins with the declaration of a C++ structure named XmmVal , which is declared in the header file XmmVal.h. This structure contains a publicly accessible anonymous union whose members correspond to the packed data types that can be used with a 128-bit wide x86-AVX operand. Note that XmmVal is declared using the alignas(16) specifier. This specifier instructs the C++ compiler to align each instance of an XmmVal on a 16-byte boundary. When an x86 processor executes an x86-AVX instruction that references an operand in memory, maximum performance is achieved when the operand is aligned on its natural boundary (e.g., 16-, 32-, or 64-byte boundaries for 128-, 256-, or 512-bit wide data types, respectively). Some x86-AVX instructions require their operands to be properly aligned, and these instructions will raise an exception if they attempt to access a misaligned operand in memory. You will learn more about this later. The structure XmmVal also contains several member functions that format the contents of an XmmVal variable for streaming to std::cout. The source code for these member functions is not shown in Listing 2-1 but is included in the software download package. Structure XmmVal is used in this example and in later source code examples to demonstrate x86-AVX SIMD operations.
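If a buffer's alignment cannot be guaranteed, the unaligned load and store intrinsics _mm_loadu_si128() and _mm_storeu_si128() can be used instead of their aligned counterparts, although aligned accesses are still preferred when possible. The short sketch below is not part of Listing 2-1; the function name is illustrative only.

#include <immintrin.h>
#include <cstdint>
// Sketch only: packed 16-bit addition without any alignment requirement.
// The unaligned intrinsics accept arbitrarily aligned addresses.
void AddI16_Unaligned(int16_t* c, const int16_t* a, const int16_t* b)
{
    __m128i a_vals = _mm_loadu_si128((const __m128i*)a);
    __m128i b_vals = _mm_loadu_si128((const __m128i*)b);
    _mm_storeu_si128((__m128i*)c, _mm_add_epi16(a_vals, b_vals));
}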

The next file in Listing 2-1, Ch02_01.h, incorporates the requisite C++ function declarations for this source code example. Note that the function declarations in this header file use the previously defined XmmVal structure. Also note that file Ch02_01.h begins with a short comment block that includes its name to make it identifiable in the listing.

The file Ch02_01.cpp is next. This file contains the function main() and two other static functions named AddI16() and AddU16(). Function AddI16() begins its execution by initializing two XmmVal variables with packed 16-bit signed integer data. This is followed by a call to the function AddI16_Iavx(), which performs packed 16-bit signed integer addition. The remaining code in AddI16() displays the results calculated by AddI16_Iavx(). The function AddU16() is almost identical to AddI16() except that it uses unsigned instead of signed integers.

The final file in Listing 2-1 is Ch02_01_fcpp.cpp. This file contains two SIMD calculating functions named AddI16_Iavx() and AddU16_Iavx(). Near the top of this file is an #include statement for the header file immintrin.h. This file contains the declarations for the C++ SIMD intrinsic functions that are used in Ch02_01_fcpp.cpp. Function AddI16_Iavx() begins its execution with a call to _mm_load_si128 (). This C++ SIMD intrinsic function loads the contents of argument a into a_vals. Note that a_vals is declared as an __m128i, which is a 128-bit wide C++ SIMD intrinsic type that can hold packed 8-, 16-, 32-, or 64-bit integers. The _mm_load_si128 () function requires its source operand to be properly aligned on a 16-byte boundary. This requirement is satisfied by the alignas(16) specifier that was used in the declaration of XmmVal. Another call to _mm_load_si128 () is then employed to initialize b_vals.

Following SIMD variable initialization, AddI16_Iavx() employs the C++ SIMD intrinsic function _mm_add_epi16 (), which performs packed 16-bit integer addition using operands a_vals and b_vals. The result of this addition is saved in an __m128i variable named c1_vals. The ensuing call to _mm_adds_epi16 () also performs packed 16-bit integer addition but carries out its calculations using saturated instead of wraparound arithmetic. The final two code statements of AddI16_Iavx() employ the C++ SIMD intrinsic function _mm_store_si128 (). This function saves a 128-bit wide packed integer value to the specified target buffer, which must be aligned on a 16-byte boundary.

Function AddU16_Iavx() performs its calculations using packed 16-bit unsigned integers. This function is identical to AddI16_Iavx() except for the use of _mm_adds_epu16 (), which performs 16-bit unsigned integer addition using saturated arithmetic. Here are the results for source code example Ch02_01:
Results for AddI16_Iavx - Wraparound Addition
a:        10     200      30  -32766   |      50      60   32000  -32000
b:       100    -200   32760    -400   |     500    -600    1200    -950
c1:      110       0  -32746   32370   |     550    -540  -32336   32586
Results for AddI16_Iavx - Saturated Addition
a:        10     200      30  -32766   |      50      60   32000  -32000
b:       100    -200   32760    -400   |     500    -600    1200    -950
c2:      110       0   32767  -32768   |     550    -540   32767  -32768
Results for AddU16_Iavx - Wraparound Addition
a:        10     200     300   32766   |      50   20000   32000   32000
b:       100     200   65530   40000   |     500   25000    1200   50000
c1:      110     400     294    7230   |     550   45000   33200   16464
Results for AddU16_Iavx - Saturated Addition
a:        10     200     300   32766   |      50   20000   32000   32000
b:       100     200   65530   40000   |     500   25000    1200   50000
c2:      110     400   65535   65535   |     550   45000   33200   65535

Other C++ SIMD intrinsic functions are available for packed integer addition. You can use the C++ intrinsic function _mm_add_epi8(), _mm_add_epi32(), or _mm_add_epi64() to perform packed addition using 8-, 32-, or 64-bit wide signed or unsigned integers. You can also use the function _mm_adds_epi8() or _mm_adds_epu8() to carry out packed saturated addition using 8-bit signed or unsigned integers. Note that distinct C++ SIMD intrinsic functions are used for wraparound and saturated integer addition since these operations can generate different results as explained in Chapter 1.
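The following short sketch, which is not part of example Ch02_01, contrasts wraparound and saturated addition using packed 8-bit signed integers; the values and function name are illustrative only.

#include <immintrin.h>
#include <cstdint>
#include <iostream>
// Sketch only: wraparound vs. saturated addition of packed 8-bit signed integers.
static void AddI8_Demo(void)
{
    __m128i a = _mm_set1_epi8(100);                 // sixteen copies of 100
    __m128i b = _mm_set1_epi8(50);                  // sixteen copies of 50
    __m128i wrap = _mm_add_epi8(a, b);              // 150 wraps around to -106
    __m128i sat = _mm_adds_epi8(a, b);              // 150 saturates to 127
    std::cout << (int)(int8_t)_mm_extract_epi8(wrap, 0) << '\n';    // -106
    std::cout << (int)(int8_t)_mm_extract_epi8(sat, 0) << '\n';     // 127
}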

Integer Subtraction

The next source code example, named Ch02_02, illustrates packed SIMD subtraction using 32- and 64-bit signed integers. Listing 2-2 shows the source code for example Ch02_02.
//------------------------------------------------
//               Ch02_02.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch02_02_fcpp.cpp
extern void SubI32_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
extern void SubI64_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
//------------------------------------------------
//               Ch02_02.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_02.h"
static void SubI32(void);
static void SubI64(void);
int main()
{
    SubI32();
    SubI64();
    return 0;
}
static void SubI32(void)
{
    XmmVal a, b, c;
    const char nl = '\n';
    a.m_I32[0] = 1000000;     b.m_I32[0] = 100;
    a.m_I32[1] = 200;         b.m_I32[1] = -200;
    a.m_I32[2] = -30;         b.m_I32[2] = 30000;
    a.m_I32[3] = 40000000;    b.m_I32[3] = 5000;
    SubI32_Iavx(&c, &a, &b);
    std::cout << " Results for SubI32_Iavx ";
    std::cout << "a:  " << a.ToStringI32() << nl;
    std::cout << "b:  " << b.ToStringI32() << nl;
    std::cout << "c:  " << c.ToStringI32() << nl;
}
static void SubI64(void)
{
    XmmVal a, b, c;
    const char nl = '\n';
    a.m_I64[0] = 100000000000; b.m_I64[0] = 99;
    a.m_I64[1] = 200;          b.m_I64[1] = 300000000000;
    SubI64_Iavx(&c, &a, &b);
    std::cout << " Results for SubI64_Iavx ";
    std::cout << "a:  " << a.ToStringI64() << nl;
    std::cout << "b:  " << b.ToStringI64() << nl;
    std::cout << "c:  " << c.ToStringI64() << nl;
}
//------------------------------------------------
//               Ch02_02_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_02.h"
extern void SubI32_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_sub_epi32 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
extern void SubI64_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_sub_epi64 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
Listing 2-2

Example Ch02_02

The organization of the source code in example Ch02_02 parallels the previous example. In the file Ch02_02.cpp, the function SubI32() initializes XmmVal variables a and b using 32-bit signed integers. It then calls SubI32_Iavx(), which performs the packed subtraction. Function SubI64() is akin to SubI32() but uses 64-bit signed integers.

Function SubI32_Iavx() begins its execution with two _mm_load_si128 () calls that initialize XmmVal variables a and b. It then uses the C++ SIMD intrinsic function _mm_sub_epi32 () to perform packed 32-bit integer subtraction. The results are then saved using _mm_store_si128 (). Function SubI64_Iavx() is almost identical to SubI32_Iavx() except that it employs _mm_sub_epi64 () to carry out packed 64-bit integer subtraction. Here are the results for source code example Ch02_02:
Results for SubI32_Iavx
a:           1000000             200   |             -30        40000000
b:               100            -200   |           30000            5000
c:            999900             400   |          -30030        39995000
Results for SubI64_Iavx
a:                      100000000000   |                             200
b:                                99   |                    300000000000
c:                       99999999901   |                   -299999999800

You can use the C++ SIMD intrinsic function _mm_sub_epi8() or _mm_sub_epi16() to perform packed subtraction using 8- or 16-bit wide signed or unsigned integers, respectively. To perform saturated packed subtraction, you can use the C++ SIMD intrinsic function _mm_subs_epi8(), _mm_subs_epi16(), _mm_subs_epu8(), or _mm_subs_epu16().
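The sketch below, which is not part of example Ch02_02, shows the difference between wraparound and saturated subtraction using packed 16-bit unsigned integers; the values and function name are illustrative only.

#include <immintrin.h>
#include <cstdint>
#include <iostream>
// Sketch only: wraparound vs. saturated subtraction of packed 16-bit unsigned integers.
static void SubU16_Demo(void)
{
    __m128i a = _mm_set1_epi16(100);                // eight copies of 100
    __m128i b = _mm_set1_epi16(200);                // eight copies of 200
    __m128i wrap = _mm_sub_epi16(a, b);             // 100 - 200 wraps to 65436
    __m128i sat = _mm_subs_epu16(a, b);             // 100 - 200 saturates to 0
    std::cout << (uint16_t)_mm_extract_epi16(wrap, 0) << '\n';      // 65436
    std::cout << (uint16_t)_mm_extract_epi16(sat, 0) << '\n';       // 0
}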

Integer Multiplication

There are also C++ SIMD intrinsic functions that perform packed integer multiplication. These functions are used somewhat differently than the packed integer addition and subtraction functions you saw in the previous two examples. Listing 2-3 shows the source code for example Ch02_03. This example demonstrates packed multiplication using 16- and 32-bit signed integers.
//------------------------------------------------
//               Ch02_03.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch02_03_fcpp.cpp
extern void MulI16_Iavx(XmmVal c[2], const XmmVal* a, const XmmVal* b);
extern void MulI32a_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
extern void MulI32b_Iavx(XmmVal c[2], const XmmVal* a, const XmmVal* b);
//------------------------------------------------
//               Ch02_03.cpp
//------------------------------------------------
#include <iostream>
#include <iomanip>
#include <string>
#include "Ch02_03.h"
static void MulI16(void);
static void MulI32a(void);
static void MulI32b(void);
int main()
{
    const char nl = '\n';
    std::string sep(75, '-');
    MulI16();
    std::cout << nl << sep << nl;
    MulI32a();
    std::cout << nl << sep << nl;
    MulI32b();
    return 0;
}
static void MulI16(void)
{
    const char nl = '\n';
    XmmVal a, b, c[2];
    a.m_I16[0] = 10;        b.m_I16[0] = -5;
    a.m_I16[1] = 3000;      b.m_I16[1] = 100;
    a.m_I16[2] = -2000;     b.m_I16[2] = -9000;
    a.m_I16[3] = 42;        b.m_I16[3] = 1000;
    a.m_I16[4] = -5000;     b.m_I16[4] = 25000;
    a.m_I16[5] = 8;         b.m_I16[5] = 16384;
    a.m_I16[6] = 10000;     b.m_I16[6] = 3500;
    a.m_I16[7] = -60;       b.m_I16[7] = 6000;
    MulI16_Iavx(c, &a, &b);
    std::cout << " Results for MulI16_Iavx ";
    for (size_t i = 0; i < 8; i++)
    {
        std::cout << "a[" << i << "]: " << std::setw(8) << a.m_I16[i] << "  ";
        std::cout << "b[" << i << "]: " << std::setw(8) << b.m_I16[i] << "  ";
        if (i < 4)
        {
            std::cout << "c[0][" << i << "]: ";
            std::cout << std::setw(12) << c[0].m_I32[i] << nl;
        }
        else
        {
            std::cout << "c[1][" << i - 4 << "]: ";
            std::cout << std::setw(12) << c[1].m_I32[i - 4] << nl;
        }
    }
}
static void MulI32a(void)
{
    const char nl = '\n';
    XmmVal a, b, c;
    a.m_I32[0] = 10;        b.m_I32[0] = -500;
    a.m_I32[1] = 3000;      b.m_I32[1] = 100;
    a.m_I32[2] = -2000;     b.m_I32[2] = -12000;
    a.m_I32[3] = 4200;      b.m_I32[3] = 1000;
    MulI32a_Iavx(&c, &a, &b);
    std::cout << " Results for MulI32a_Iavx ";
    for (size_t i = 0; i < 4; i++)
    {
        std::cout << "a[" << i << "]: " << std::setw(10) << a.m_I32[i] << "  ";
        std::cout << "b[" << i << "]: " << std::setw(10) << b.m_I32[i] << "  ";
        std::cout << "c[" << i << "]: " << std::setw(10) << c.m_I32[i] << nl;
    }
}
static void MulI32b(void)
{
    const char nl = '\n';
    XmmVal a, b, c[2];
    a.m_I32[0] = 10;        b.m_I32[0] = -500;
    a.m_I32[1] = 3000;      b.m_I32[1] = 100;
    a.m_I32[2] = -40000;    b.m_I32[2] = -120000;
    a.m_I32[3] = 4200;      b.m_I32[3] = 1000;
    MulI32b_Iavx(c, &a, &b);
    std::cout << " Results for MulI32b_Iavx ";
    for (size_t i = 0; i < 4; i++)
    {
        std::cout << "a[" << i << "]: " << std::setw(10) << a.m_I32[i] << "  ";
        std::cout << "b[" << i << "]: " << std::setw(10) << b.m_I32[i] << "  ";
        if (i < 2)
        {
            std::cout << "c[0][" << i << "]: ";
            std::cout << std::setw(14) << c[0].m_I64[i] << nl;
        }
        else
        {
            std::cout << "c[1][" << i - 2 << "]: ";
            std::cout << std::setw(14) << c[1].m_I64[i - 2] << nl;
        }
    }
}
//------------------------------------------------
//               Ch02_03_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_03.h"
void MulI16_Iavx(XmmVal c[2], const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i temp_lo = _mm_mullo_epi16 (a_vals, b_vals);
    __m128i temp_hi = _mm_mulhi_epi16 (a_vals, b_vals);
    __m128i result_lo = _mm_unpacklo_epi16 (temp_lo, temp_hi);
    __m128i result_hi = _mm_unpackhi_epi16 (temp_lo, temp_hi);
    _mm_store_si128 ((__m128i*)&c[0], result_lo);
    _mm_store_si128 ((__m128i*)&c[1], result_hi);
}
void MulI32a_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_mullo_epi32 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
void MulI32b_Iavx(XmmVal c[2], const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i temp1 = _mm_mul_epi32 (a_vals, b_vals);      // q2 | q0
    __m128i temp2 = _mm_srli_si128 (a_vals, 4);
    __m128i temp3 = _mm_srli_si128 (b_vals, 4);
    __m128i temp4 = _mm_mul_epi32 (temp2, temp3);        // q3 | q1
    *(&c[0].m_I64[0]) = _mm_extract_epi64 (temp1, 0);    // q0
    *(&c[0].m_I64[1]) = _mm_extract_epi64 (temp4, 0);    // q1
    *(&c[1].m_I64[0]) = _mm_extract_epi64 (temp1, 1);    // q2
    *(&c[1].m_I64[1]) = _mm_extract_epi64 (temp4, 1);    // q3
}
Listing 2-3

Example Ch02_03

In file Ch02_03.cpp, the C++ function MulI16() contains code that initializes XmmVal variables a and b using 16-bit signed integers. It then calls MulI16_Iavx(), which performs the packed integer multiplication. The results are then streamed to std::cout. The other two static functions in Ch02_03.cpp, MulI32a() and MulI32b(), perform similar initialization tasks for 32-bit packed integer multiplication and then call MulI32a_Iavx() and MulI32b_Iavx(), respectively.

Recall that when performing integer multiplication, the product of two n-bit wide integers is always 2n bits. When performing packed integer multiplication, additional processing is sometimes necessary to form the final products as you will soon see. Function MulI16_Iavx() starts with two _mm_load_si128 () calls that initialize XmmVal variables a and b. It then employs the C++ SIMD intrinsic function _mm_mullo_epi16 () to perform packed 16-bit integer multiplication. This function returns an __m128i value that contains only the low-order 16 bits of each 32-bit product. Function MulI16_Iavx() then uses the C++ SIMD intrinsic function _mm_mulhi_epi16 () to calculate the high-order 16 bits of each 32-bit product. The next function call, _mm_unpacklo_epi16 (), interleaves the four low-order 16-bit integer elements of source operands temp_lo and temp_hi to form 32-bit integer products. The subsequent call to _mm_unpackhi_epi16 () performs a similar operation using the four high-order 16-bit integer elements of the same source operands. Figure 2-1 illustrates the product calculation that MulI16_Iavx() employs.
Figure 2-1

Packed 16-bit signed integer multiplication
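To make the low-order/high-order split concrete, here is the scalar equivalent of one element pair from Figure 2-1. This is a sketch for illustration only and is not code from the example.

#include <cstdint>
// Scalar view of a single lane from Figure 2-1 (illustration only).
static int32_t MulI16_OneLane(int16_t a, int16_t b)
{
    int32_t product = (int32_t)a * (int32_t)b;          // full 32-bit signed product
    uint16_t lo = (uint16_t)(product & 0xFFFF);          // what _mm_mullo_epi16() keeps
    int16_t hi = (int16_t)(product >> 16);               // what _mm_mulhi_epi16() keeps
    // Interleaving hi:lo, as the unpack intrinsics do, rebuilds the full product.
    return (int32_t)(((uint32_t)(uint16_t)hi << 16) | lo);
}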

Function MulI32a_Iavx() highlights one method of performing packed 32-bit signed integer multiplication. This function uses the C++ SIMD intrinsic function _mm_mullo_epi32 () to calculate the low-order 32 bits of each product. The packed 32-bit integer products are then saved using _mm_store_si128 (). This technique is suitable when calculating multiplicative products that will not exceed the value limits of a 32-bit signed integer.

Function MulI32b_Iavx() demonstrates another technique for calculating products using packed 32-bit signed integers. Following the initialization of a_vals and b_vals, MulI32b_Iavx() uses the C++ SIMD intrinsic function _mm_mul_epi32 () to calculate a.m_I32[0] * b.m_I32[0] and a.m_I32[2] * b.m_I32[2]. Note that _mm_mul_epi32 () computes complete 64-bit products. The two subsequent _mm_srli_si128 () calls right shift a_vals and b_vals by 4 bytes (zeros are shifted in). The reason for these shifts is that _mm_mul_epi32 () carries out its calculations using only the even-numbered elements of its two operands. The right shift operations facilitate calculation of the products a.m_I32[1] * b.m_I32[1] and a.m_I32[3] * b.m_I32[3] by the ensuing call to _mm_mul_epi32 () as shown in Figure 2-2. The final 64-bit products are then extracted from temp1 and temp4 using a series of calls to _mm_extract_epi64 (). The constant that is specified in each _mm_extract_epi64 () call designates which 64-bit integer to extract from the 128-bit wide source operand.
Figure 2-2

Packed 32-bit signed integer multiplication
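As an alternative to the four _mm_extract_epi64() calls, the two product registers can also be interleaved with the 64-bit unpack intrinsics and stored directly. The variant below is a sketch only and is not the version used in example Ch02_03.

#include <immintrin.h>
#include "Ch02_03.h"
// Sketch only: same products as MulI32b_Iavx(), assembled with 64-bit unpacks.
void MulI32b_IavxAlt(XmmVal c[2], const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128((__m128i*)a);
    __m128i b_vals = _mm_load_si128((__m128i*)b);
    __m128i temp1 = _mm_mul_epi32(a_vals, b_vals);                      // q2 | q0
    __m128i temp2 = _mm_mul_epi32(_mm_srli_si128(a_vals, 4),
                                  _mm_srli_si128(b_vals, 4));           // q3 | q1
    _mm_store_si128((__m128i*)&c[0], _mm_unpacklo_epi64(temp1, temp2)); // q1 | q0
    _mm_store_si128((__m128i*)&c[1], _mm_unpackhi_epi64(temp1, temp2)); // q3 | q2
}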

Here are the results for source code example Ch02_03:
Results for MulI16_Iavx
a[0]:       10  b[0]:       -5  c[0][0]:          -50
a[1]:     3000  b[1]:      100  c[0][1]:       300000
a[2]:    -2000  b[2]:    -9000  c[0][2]:     18000000
a[3]:       42  b[3]:     1000  c[0][3]:        42000
a[4]:    -5000  b[4]:    25000  c[1][0]:   -125000000
a[5]:        8  b[5]:    16384  c[1][1]:       131072
a[6]:    10000  b[6]:     3500  c[1][2]:     35000000
a[7]:      -60  b[7]:     6000  c[1][3]:      -360000
---------------------------------------------------------------------------
Results for MulI32a_Iavx
a[0]:         10  b[0]:       -500  c[0]:      -5000
a[1]:       3000  b[1]:        100  c[1]:     300000
a[2]:      -2000  b[2]:     -12000  c[2]:   24000000
a[3]:       4200  b[3]:       1000  c[3]:    4200000
---------------------------------------------------------------------------
Results for MulI32b_Iavx
a[0]:         10  b[0]:       -500  c[0][0]:          -5000
a[1]:       3000  b[1]:        100  c[0][1]:         300000
a[2]:     -40000  b[2]:    -120000  c[1][0]:     4800000000
a[3]:       4200  b[3]:       1000  c[1][1]:        4200000

Integer Bitwise Logical and Shift Operations

Besides standard arithmetic operations, x86-AVX also supports other common operations using 128-bit wide packed integer operands. In this section, you will learn how to carry out bitwise logical and shift operations.

Bitwise Logical Operations

Listing 2-4 shows the source code for example Ch02_04. This example demonstrates how to perform bitwise logical (e.g., AND, OR, and exclusive OR) operations using packed integer operands.
//------------------------------------------------
//               Ch02_04.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch02_04_fcpp.cpp
extern void AndU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
extern void OrU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
extern void XorU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b);
//------------------------------------------------
//               Ch02_04.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_04.h"
static void BitwiseLogical(void);
int main()
{
    BitwiseLogical();
    return 0;
}
static void BitwiseLogical(void)
{
    XmmVal a, b, c;
    const char nl = '\n';
    a.m_U16[0] = 0x1234;      b.m_U16[0] = 0xFF00;
    a.m_U16[1] = 0xABDC;      b.m_U16[1] = 0x00FF;
    a.m_U16[2] = 0xAA55;      b.m_U16[2] = 0xAAAA;
    a.m_U16[3] = 0x1111;      b.m_U16[3] = 0x5555;
    a.m_U16[4] = 0xFFFF;      b.m_U16[4] = 0x8000;
    a.m_U16[5] = 0x7F7F;      b.m_U16[5] = 0x7FFF;
    a.m_U16[6] = 0x9876;      b.m_U16[6] = 0xF0F0;
    a.m_U16[7] = 0x7F00;      b.m_U16[7] = 0x0880;
    AndU16_Iavx(&c, &a, &b);
    std::cout << " Results for AndU16_Iavx ";
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "b: " << b.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
    OrU16_Iavx(&c, &a, &b);
    std::cout << " Results for OrU16_Iavx ";
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "b: " << b.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
    XorU16_Iavx(&c, &a, &b);
    std::cout << " Results for XorU16_Iavx ";
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "b: " << b.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
}
//------------------------------------------------
//               Ch02_04_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_04.h"
void AndU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_and_si128(a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
void OrU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_or_si128 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
void XorU16_Iavx(XmmVal* c, const XmmVal* a, const XmmVal* b)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_load_si128 ((__m128i*)b);
    __m128i c_vals = _mm_xor_si128 (a_vals, b_vals);
    _mm_store_si128 ((__m128i*)c, c_vals);
}
Listing 2-4

Example Ch02_04

In Listing 2-4, the function BitwiseLogical() initializes XmmVal variables a and b with test values. It then invokes the calculating functions AndU16_Iavx(), OrU16_Iavx(), and XorU16_Iavx(). These functions call the corresponding C++ SIMD intrinsic functions _mm_and_si128 (), _mm_or_si128 (), and _mm_xor_si128 (). Unlike packed integer arithmetic operations, there are no distinct integer elements in a packed bitwise logical operation. The processor simply carries out the specified Boolean operation using the corresponding bit positions of each SIMD operand. Here are the results for source code example Ch02_04:
Results for AndU16_Iavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     1200    00DC    AA00    1111   |    8000    7F7F    9070    0800
Results for OrU16_Iavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     FF34    ABFF    AAFF    5555   |    FFFF    7FFF    F8F6    7F80
Results for XorU16_Iavx
a:     1234    ABDC    AA55    1111   |    FFFF    7F7F    9876    7F00
b:     FF00    00FF    AAAA    5555   |    8000    7FFF    F0F0    0880
c:     ED34    AB23    00FF    4444   |    7FFF    0080    6886    7780
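A fourth bitwise function worth knowing about is _mm_andnot_si128(), which computes the AND of the complement of its first operand with its second operand. It is not used in example Ch02_04; the brief sketch below is illustrative only.

#include <immintrin.h>
#include <iostream>
// Sketch only: _mm_andnot_si128(mask, x) computes (~mask) & x, which clears
// the bits of x that are set in mask.
static void AndNotDemo(void)
{
    __m128i x = _mm_set1_epi16((short)0x1234);
    __m128i mask = _mm_set1_epi16((short)0x00FF);
    __m128i r = _mm_andnot_si128(mask, x);          // each element: 0x1234 & ~0x00FF = 0x1200
    std::cout << std::hex << (_mm_extract_epi16(r, 0) & 0xFFFF) << '\n';    // 1200
}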

Shift Operations

Listing 2-5 shows the source code for example Ch02_05. This example illustrates use of several C++ SIMD intrinsic functions that perform logical and arithmetic shift operations.
//------------------------------------------------
//               Ch02_05.h
//------------------------------------------------
#pragma once
#include "XmmVal.h"
// Ch02_05_fcpp.cpp
extern void SllU16_Iavx(XmmVal* c, const XmmVal* a, int count);
extern void SrlU16_Iavx(XmmVal* c, const XmmVal* a, int count);
extern void SraU16_Iavx(XmmVal* c, const XmmVal* a, int count);
//------------------------------------------------
//               Ch02_05.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_05.h"
static void ShiftU16(void);
int main()
{
    ShiftU16();
    return 0;
}
static void ShiftU16(void)
{
    XmmVal a, c;
    const int count_l = 8;
    const int count_r = 4;
    const char nl = '\n';
    a.m_U16[0] = 0x1234;
    a.m_U16[1] = 0xFFB0;
    a.m_U16[2] = 0x00CC;
    a.m_U16[3] = 0x8080;
    a.m_U16[4] = 0x00FF;
    a.m_U16[5] = 0xAAAA;
    a.m_U16[6] = 0x0F0F;
    a.m_U16[7] = 0x0101;
    SllU16_Iavx(&c, &a, count_l);
    std::cout << " Results for SllU16_Aavx - count = " << count_l << nl;
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
    SrlU16_Iavx(&c, &a, count_r);
    std::cout << " Results for SrlU16_Aavx - count = " << count_r << nl;
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
    SraU16_Iavx(&c, &a, count_r);
    std::cout << " Results for SraU16_Aavx - count = " << count_r << nl;
    std::cout << "a: " << a.ToStringX16() << nl;
    std::cout << "c: " << c.ToStringX16() << nl;
}
//------------------------------------------------
//               Ch02_05_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_05.h"
void SllU16_Iavx(XmmVal* c, const XmmVal* a, int count)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_slli_epi16 (a_vals, count);
    _mm_store_si128 ((__m128i*)c, b_vals);
}
void SrlU16_Iavx(XmmVal* c, const XmmVal* a, int count)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_srli_epi16 (a_vals, count);
    _mm_store_si128 ((__m128i*)c, b_vals);
}
void SraU16_Iavx(XmmVal* c, const XmmVal* a, int count)
{
    __m128i a_vals = _mm_load_si128 ((__m128i*)a);
    __m128i b_vals = _mm_srai_epi16 (a_vals, count);
    _mm_store_si128 ((__m128i*)c, b_vals);
}
Listing 2-5

Example Ch02_05

Source code example Ch02_05 begins its execution with the initialization of XmmVal variable a using 16-bit wide integer elements. It then executes several calculating functions that perform various shift operations. The function SllU16_Iavx() uses the C++ SIMD intrinsic function _mm_slli_epi16 () to left shift each 16-bit integer element by count bits. Similarly, function SrlU16_Iavx() employs the C++ SIMD intrinsic function _mm_srli_epi16 () to logically right shift each 16-bit integer element by count bits. Note that both _mm_slli_epi16 () and _mm_srli_epi16 () shift in zeros for each element from the left or right, respectively. The final calculation is carried out by SraU16_Iavx(), which uses the C++ SIMD intrinsic function _mm_srai_epi16 () to perform an arithmetic right shift of each 16-bit integer element. Recall that an integer arithmetic right shift fills empty bit positions using the value of the sign bit. Here are the results for source code example Ch02_05:
Results for SllU16_Iavx - count = 8
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     3400    B000    CC00    8000   |    FF00    AA00    0F00    0100
Results for SrlU16_Iavx - count = 4
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     0123    0FFB    000C    0808   |    000F    0AAA    00F0    0010
Results for SraU16_Iavx - count = 4
a:     1234    FFB0    00CC    8080   |    00FF    AAAA    0F0F    0101
c:     0123    FFFB    000C    F808   |    000F    FAAA    00F0    0010
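Besides the _mm_slli_epi16(), _mm_srli_epi16(), and _mm_srai_epi16() functions used above, x86-AVX also provides _mm_sll_epi16(), _mm_srl_epi16(), and _mm_sra_epi16(), which take the shift count in the low 64 bits of an __m128i operand. The brief sketch below is not part of example Ch02_05.

#include <immintrin.h>
// Sketch only: shift count supplied in the low 64 bits of an XMM register.
static __m128i SllU16_VarCount(__m128i a, int count)
{
    __m128i count_vec = _mm_cvtsi32_si128(count);   // count in bits 0-31, upper bits zero
    return _mm_sll_epi16(a, count_vec);             // logical left shift of each 16-bit element
}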

C++ SIMD Intrinsic Function Naming Conventions

If you haven’t done so already, now might be a good time to take a quick look at the Intel Intrinsics Guide website (https://software.intel.com/sites/landingpage/IntrinsicsGuide/). This site includes comprehensive information that describes the operation of all C++ SIMD intrinsic functions. When exploring the Intel Intrinsics Guide website, it is easy to get overwhelmed by the number of available C++ SIMD (and non-SIMD) intrinsic functions. Fortunately, most of the C++ SIMD intrinsic functions and data types follow a straightforward naming convention. Functions are named using a <prefix>_<intrinop>_<suffix> pattern. The <intrinop> component is a short descriptive text string that describes the operation (e.g., add, sub, and, xor, etc.). Table 2-1 shows the text strings that are used for the <prefix> and <suffix> components of a function name, and a few decoded example names follow the table. Most AVX-512 C++ SIMD intrinsic function names also include the text string _mask_ or _maskz_ to signify that they require an integer mask operand.
Table 2-1

C++ SIMD Intrinsic Function Name Prefixes and Suffixes

String

Type

Description

_mm

Prefix

X86-AVX function that uses 128-bit wide operands

_mm256

Prefix

X86-AVX function that uses 256-bit wide operands

_mm512

Prefix

X86-AVX function that uses 512-bit wide operands

_epi8

Suffix

Packed 8-bit signed integers

_epi16

Suffix

Packed 16-bit signed integers

_epi32

Suffix

Packed 32-bit signed integers

_epi64

Suffix

Packed 64-bit signed integers

_epu8

Suffix

Packed 8-bit unsigned integers

_epu16

Suffix

Packed 16-bit unsigned integers

_epu32

Suffix

Packed 32-bit unsigned integers

_epu64

Suffix

Packed 64-bit unsigned integers

_ss

Suffix

Scalar single-precision floating-point

_sd

Suffix

Scalar double-precision floating-point

_ps

Suffix

Packed single-precision floating-point

_pd

Suffix

Packed double-precision floating-point
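To illustrate the naming pattern, here are a few intrinsic function names decoded using the conventions in Table 2-1:

// Decoding a few intrinsic names using the <prefix>_<intrinop>_<suffix> pattern:
//   _mm_adds_epu16         - 128-bit operands, saturated addition, packed 16-bit unsigned integers
//   _mm256_mul_pd          - 256-bit operands, multiplication, packed double-precision floating-point
//   _mm512_maskz_add_epi32 - 512-bit operands, zero-masked addition, packed 32-bit signed integers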

It should be noted that many of the C++ SIMD intrinsic functions that carry out their operations using 128-bit wide SIMD operands will also execute on processors that support SSE, SSE2, SSE3, SSSE3, SSE4.1, or SSE4.2. For more information, you can consult the previously mentioned Intel Intrinsics Guide website.

Table 2-2 lists the principal C++ SIMD intrinsic data types.
Table 2-2

C++ SIMD Intrinsic Data Types

Type

Description

__m128

128-bit wide packed single-precision floating-point

__m128d

128-bit wide packed double-precision floating-point

__m128i

128-bit wide packed integers

__m256

256-bit wide packed single-precision floating-point

__m256d

256-bit wide packed double-precision floating-point

__m256i

256-bit wide packed integers

__m512

512-bit wide packed single-precision floating-point

__m512d

512-bit wide packed double-precision floating-point

__m512i

512-bit wide packed integers

It is important to keep in mind that none of the C++ SIMD intrinsic functions and data types are defined in any of the ISO C++ standards. Minor discrepancies exist between mainstream compilers such as Visual C++ and GNU C++. Also, these compilers employ different techniques to implement the various SIMD functions and data types. If you are developing code that needs to work on multiple platforms, you should avoid directly referencing any of the internal members of the data types shown in Table 2-2. You can employ the portable SIMD data types used in this book (e.g., XmmVal) or define your own portable SIMD data type.
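For example, rather than reading a compiler-specific internal member of an __m128i variable, a portable way to examine individual elements is to store the register into a properly aligned buffer and then index the buffer. The helper below is a sketch only and is not part of this book's source code.

#include <immintrin.h>
#include <cstdint>
#include <cstddef>
// Sketch only: portable element access via an aligned store.
static int32_t GetElementI32(__m128i v, size_t index)
{
    alignas(16) int32_t buf[4];
    _mm_store_si128((__m128i*)buf, v);
    return buf[index & 3];                          // index is masked to stay in range
}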

Image Processing Algorithms

The source code examples presented thus far were designed to familiarize you with basic C++ SIMD intrinsic functions and common packed integer operations. To fully exploit the performance benefits of x86-AVX, you must develop complete SIMD functions. The source code examples in this section explain how to code a few simple image processing functions.

In the first example, you will learn how to utilize x86-AVX and C++ SIMD intrinsic functions to find the minimum and maximum value in an array of 8-bit unsigned integers. This example has real-world utility since digital images are often arranged in memory using arrays or matrices of 8-bit unsigned integers. Also, many image processing algorithms (e.g., contrast enhancement) often need to ascertain the minimum (darkest) and maximum (brightest) pixel values in an image. The second source code example illustrates how to calculate the mean value of an array of 8-bit unsigned integers using SIMD arithmetic. This is another example of a realistic algorithm that is directly relevant to the province of image processing. Finally, you will learn some straightforward techniques for benchmarking the performance of a SIMD function.

Pixel Minimum and Maximum

Source code example Ch02_06 demonstrates how to find the minimum and maximum values in an array of 8-bit unsigned integers. This example also illuminates the use of dynamically allocated arrays that are properly aligned for use in x86-AVX calculating functions and some performance benchmarking techniques. Listing 2-6 shows the source code for example Ch02_06.
//------------------------------------------------
//               Ch02_06.h
//------------------------------------------------
#pragma once
#include <cstddef>
#include <cstdint>
// Ch02_06_fcpp.cpp
extern bool CalcMinMaxU8_Cpp(uint8_t* x_min, uint8_t* x_max, const uint8_t* x, size_t n);
extern bool CalcMinMaxU8_Iavx(uint8_t* x_min, uint8_t* x_max, const uint8_t* x, size_t n);
// Ch02_06_misc.cpp
extern void InitArray(uint8_t* x, size_t n, unsigned int rng_seed);
// Ch02_06_BM.cpp
extern void CalcMinMaxU8_bm();
// c_NumElements must be > 0 and even multiple of 16
const size_t c_NumElements = 10000000;
const unsigned int c_RngSeedVal = 23;
//------------------------------------------------
//               Ch02_06_misc.cpp
//------------------------------------------------
#include "Ch02_06.h"
#include "MT.h"
void InitArray(uint8_t* x, size_t n, unsigned int rng_seed)
{
    int rng_min_val = 5;
    int rng_max_val = 250;
    MT::FillArray(x, n, rng_min_val, rng_max_val, rng_seed);
    // Use known values for min & max (for test purposes)
    x[(n / 4) * 3 + 1] = 2;
    x[n / 4 + 11] = 3;
    x[n / 2] = 252;
    x[n / 2 + 13] = 253;
    x[n / 8 + 5] = 4;
    x[n / 8 + 7] = 254;
}
//------------------------------------------------
//               Ch02_06.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_06.h"
#include "AlignedMem.h"
static void CalcMinMaxU8();
int main()
{
    CalcMinMaxU8();
    CalcMinMaxU8_bm();
}
static void CalcMinMaxU8()
{
    size_t n = c_NumElements;
    AlignedArray<uint8_t> x_aa(n, 16);
    uint8_t* x = x_aa.Data();
    InitArray(x, n, c_RngSeedVal);
    uint8_t x_min1 = 0, x_max1 = 0;
    uint8_t x_min2 = 0, x_max2 = 0;
    bool rc1 = CalcMinMaxU8_Cpp(&x_min1, &x_max1, x, n);
    bool rc2 = CalcMinMaxU8_Iavx(&x_min2, &x_max2, x, n);
    std::cout << " Results for CalcMinMaxU8_Cpp ";
    std::cout << "rc1: " << rc1 << "  x_min1: " << (int)x_min1;
    std::cout << "  x_max1: " << (int)x_max1 << ' ';
    std::cout << " Results for CalcMinMaxU8_Iavx ";
    std::cout << "rc2: " << rc2 << "  x_min2: " << (int)x_min2;
    std::cout << "  x_max2: " << (int)x_max2 << ' ';
}
//------------------------------------------------
//               Ch02_06_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_06.h"
#include "AlignedMem.h"
bool CalcMinMaxU8_Cpp(uint8_t* x_min, uint8_t* x_max, const uint8_t* x, size_t n)
{
    if (n == 0 || ((n % 16) != 0))
        return false;
    if (!AlignedMem::IsAligned(x, 16))
        return false;
    uint8_t min_val = 0xff;
    uint8_t max_val = 0;
    for (size_t i = 0; i < n; i++)
    {
        uint8_t x_val = x[i];
        if (x_val < min_val)
            min_val = x_val;
        else if (x_val > max_val)
            max_val = x_val;
    }
    *x_min = min_val;
    *x_max = max_val;
    return true;
}
bool CalcMinMaxU8_Iavx(uint8_t* x_min, uint8_t* x_max, const uint8_t* x, size_t n)
{
    if (n == 0 || ((n % 16) != 0))
        return false;
    if (!AlignedMem::IsAligned(x, 16))
        return false;
    __m128i min_vals = _mm_set1_epi8 ((char)0xff);
    __m128i max_vals = _mm_setzero_si128 ();
    const size_t num_simd_elements = 16;
    for (size_t i = 0; i < n; i += num_simd_elements)
    {
        __m128i x_vals = _mm_load_si128 ((__m128i*)&x[i]);
        min_vals = _mm_min_epu8 (x_vals, min_vals);
        max_vals = _mm_max_epu8 (x_vals, max_vals);
    }
    __m128i temp1, temp2, temp3, temp4;
    __m128i vals_r1, vals_r2, vals_r3, vals_r4;
    // Reduce min_vals to final min_val
    temp1 = _mm_srli_si128 (min_vals, 8);
    vals_r1 = _mm_min_epu8 (min_vals, temp1);
    temp2 = _mm_srli_si128 (vals_r1, 4);
    vals_r2 = _mm_min_epu8 (vals_r1, temp2);
    temp3 = _mm_srli_si128 (vals_r2, 2);
    vals_r3 = _mm_min_epu8 (vals_r2, temp3);
    temp4 = _mm_srli_si128 (vals_r3, 1);
    vals_r4 = _mm_min_epu8 (vals_r3, temp4);
    *x_min = (uint8_t)_mm_extract_epi8 (vals_r4, 0);
    // Reduce max_vals to final max_val
    temp1 = _mm_srli_si128 (max_vals, 8);
    vals_r1 = _mm_max_epu8 (max_vals, temp1);
    temp2 = _mm_srli_si128 (vals_r1, 4);
    vals_r2 = _mm_max_epu8 (vals_r1, temp2);
    temp3 = _mm_srli_si128 (vals_r2, 2);
    vals_r3 = _mm_max_epu8 (vals_r2, temp3);
    temp4 = _mm_srli_si128 (vals_r3, 1);
    vals_r4 = _mm_max_epu8 (vals_r3, temp4);
    *x_max = (uint8_t)_mm_extract_epi8 (vals_r4, 0);
    return true;
}
//------------------------------------------------
//               Ch02_06_bm.cpp
//------------------------------------------------
#include <iostream>
#include "Ch02_06.h"
#include "AlignedMem.h"
#include "BmThreadTimer.h"
void CalcMinMaxU8_bm(void)
{
    std::cout << " Running benchmark function CalcMinMaxU8_bm - please wait ";
    size_t n = c_NumElements;
    AlignedArray<uint8_t> x_aa(n, 16);
    uint8_t* x = x_aa.Data();
    InitArray(x, n, c_RngSeedVal);
    uint8_t x_min0 = 0, x_max0 = 0;
    uint8_t x_min1 = 0, x_max1 = 0;
    const size_t num_it = 500;
    const size_t num_alg = 2;
    BmThreadTimer bmtt(num_it, num_alg);
    for (size_t i = 0; i < num_it; i++)
    {
        bmtt.Start(i, 0);
        CalcMinMaxU8_Cpp(&x_min0, &x_max0, x, n);
        bmtt.Stop(i, 0);
        bmtt.Start(i, 1);
        CalcMinMaxU8_Iavx(&x_min1, &x_max1, x, n);
        bmtt.Stop(i, 1);
    }
    std::string fn = bmtt.BuildCsvFilenameString("Ch02_06_CalcMinMaxU8_bm");
    bmtt.SaveElapsedTimes(fn, BmThreadTimer::EtUnit::MicroSec, 2);
    std::cout << "Benchmark times save to file " << fn << ' ';
}
Listing 2-6

Example Ch02_06

The first file in Listing 2-6 is Ch02_06.h. This file contains the requisite function declarations and a few miscellaneous constants. Note that the function declarations use the fixed-width integer type uint8_t, which is defined in the header file <cstdint>. Some programmers (including me) prefer to use the fixed-width integer types in SIMD calculating functions since it eschews the size ambiguities of the standard C++ integer types char, short, int, long, and long long.

The next file in Listing 2-6, Ch02_06_misc.cpp, contains a simple function named InitArray(). This function fills an array of 8-bit unsigned integers using random values. The actual filling of the array is performed by a template function named MT::FillArray(), which is defined in the header file MT.h. The driver function for this example is named CalcMinMaxU8() and is defined in Ch02_06.cpp. Near the top of CalcMinMaxU8() is the statement AlignedArray <uint8_t> x_aa(n, 16). This statement dynamically allocates an n element array of uint8_t integers that is aligned on a 16-byte boundary. The source code for both MT.h and AlignedMem.h (which contains the template class AlignedMem<T>) is not shown in Listing 2-6 but is included in the software download package.
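Although the implementation of AlignedArray<T> is not reproduced here, the basic idea of aligned allocation can be sketched using C++17's aligned operator new. The helper functions below are hypothetical stand-ins, not the book's actual class, and assume a trivially constructible element type such as uint8_t.

#include <cstddef>
#include <new>
// Hypothetical sketch of aligned allocation (not the book's AlignedArray<T>).
template <typename T> T* AllocAligned(std::size_t n, std::size_t alignment)
{
    return static_cast<T*>(::operator new[](n * sizeof(T), std::align_val_t{alignment}));
}
template <typename T> void FreeAligned(T* p, std::size_t alignment)
{
    ::operator delete[](p, std::align_val_t{alignment});
}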

The principal calculating functions for example Ch02_06 are defined in the file Ch02_06_fcpp.cpp. The first function in this module, CalcMinMaxU8_Cpp(), finds the minimum and maximum value in an array of uint8_t integers. This function is coded using typical C++ statements sans any C++ SIMD intrinsic functions and will be used later for comparison and benchmarking purposes. Note that prior to the start of the for-loop, two error checks are performed. The first error check ensures that n is not zero and is an integral multiple of 16. Requiring n to be an integral multiple of 16 is not as restrictive as it might appear since the number of pixels in a digital camera image is often an integral multiple of 64 due to the processing requirements of the JPEG algorithms. Later examples will include additional code that can process arrays of any size. The second error check ensures that the source pixel buffer x is properly aligned on a 16-byte boundary.

The SIMD counterpart function to CalcMinMaxU8_Cpp() is named CalcMinMaxU8_Iavx(). This function starts its execution by validating n for size and x for proper alignment. The next statement uses the C++ SIMD intrinsic function _mm_set1_epi8 () to set each 8-bit element in min_vals to 0xFF. This is also known as a broadcast operation. Unlike the non-SIMD min-max function, the for-loop in CalcMinMaxU8_Iavx() maintains 16 intermediate pixel minimums as it sweeps through pixel buffer x; the variable min_vals holds these values. The next statement uses _mm_setzero_si128 () to initialize each 8-bit element in max_vals to 0x00. This variable holds intermediate pixel maximums during execution of the for-loop.

Immediately before the start of the for-loop, the constant variable num_simd_elements is set to 16 and represents the number of pixels processed during each iteration of the for-loop. This is the reason for requiring n to be an integral multiple of 16. The for-loop itself begins with a call to _mm_load_si128 () that loads 16 8-bit unsigned integer elements from pixel buffer x into x_vals. The next statement uses the C++ SIMD intrinsic function _mm_min_epu8 () to update the intermediate minimum values maintained in min_vals. This function compares two 128-bit wide operands of 8-bit unsigned integers and returns an __m128i value that contains the minimum value of each element comparison, as shown in Figure 2-3. The C++ SIMD intrinsic function _mm_max_epu8 () performs the corresponding 8-bit unsigned integer packed maximum comparison and is also shown in Figure 2-3.
Figure 2-3

Pixel minimum and maximum compares using _mm_min_epu8 () and _mm_max_epu8 ()

Following execution of the for-loop, packed variables min_vals and max_vals contain the final 16 minimum and maximum values, respectively. The true pixel minimum and maximum values are elements of these variables. To find the true minimum and maximum values, both min_vals and max_vals must be reduced to a single scalar value. For min_vals, this is accomplished using a series of _mm_srli_si128 () and _mm_min_epu8 () calls as shown in Figure 2-4. The code that reduces max_vals to a scalar value is almost identical except that it uses the C++ SIMD intrinsic function _mm_max_epu8 () instead. Following each reduction operation, CalcMinMaxU8_Iavx() employs _mm_extract_epi8 () to extract the final 8-bit unsigned integer value.
Figure 2-4

Reduction of min_vals using _mm_srli_si128() and _mm_min_epu8 ()

Here are the results for source code example Ch02_06:
Results for CalcMinMaxU8_Cpp
rc1: 1  x_min1: 2  x_max1: 254
Results for CalcMinMaxU8_Iavx
rc2: 1  x_min2: 2  x_max2: 254
Running benchmark function CalcMinMaxU8_bm - please wait
Benchmark times saved to file Ch02_06_CalcMinMaxU8_bm_LITHIUM.csv

The final function in Listing 2-6 is named CalcMinMaxU8_bm(). This function contains code that measures the execution times of functions CalcMinMaxU8_Cpp() and CalcMinMaxU8_Iavx(). Most of the timing measurement code is encapsulated in a C++ class named BmThreadTimer. This class includes two member functions, BmThreadTimer::Start() and BmThreadTimer::Stop(), that implement a simple software stopwatch. Class BmThreadTimer also includes a member function named BmThreadTimer::SaveElapsedTimes(), which saves the timing measurements to a comma-separated text file. The source code for class BmThreadTimer is not shown in Listing 2-6 but is included as part of the source code download package.

Table 2-3 contains benchmark timing measurements for the functions CalcMinMaxU8_Cpp() and CalcMinMaxU8_Iavx() using two different Intel processors. These measurements were made with an EXE file that was built using the Visual C++ Release configuration and the default settings for code optimization (which includes the /O2 switch for maximum optimization, favor speed) except for the following options: AVX code generation (/arch:AVX) was selected to facilitate “apples-to-apples” comparisons between the standard C++ and C++ SIMD intrinsic function code (the default code generation option for a 64-bit Visual C++ EXE file is SSE2); whole program optimization was disabled to ensure accurate C++ function timing measurements. All timing measurements were made using ordinary desktop PCs running Windows 10. No attempt was made to account for any hardware, software, operating system, or other configuration differences between the PCs prior to running the benchmark executable file. Neither CPU was overclocked. The benchmarking test conditions described in this section are also used in subsequent chapters.
Table 2-3

Pixel Minimum and Maximum Execution Times (Microseconds), 10,000,000 pixels

CPU

CalcMinMaxU8_Cpp()

CalcMinMaxU8_Iavx()

Intel Core i7-8700K

6549

406

Intel Core i5-11600K

6783

304

The values shown in Table 2-3 were computed using the CSV file execution times (500 runs of each algorithm) and the Excel spreadsheet function TRIMMEAN(array,0.10). In example Ch02_06, the C++ SIMD intrinsic function implementation of the pixel minimum-maximum algorithm clearly outperforms the standard C++ version by a wide margin. It is not uncommon to achieve significant speed improvements when using C++ intrinsic functions, especially by algorithms that can fully exploit the SIMD parallelism of an x86 processor. You will see additional examples of accelerated algorithmic performance throughout the remainder of this book.
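For reference, a rough C++ analog of the 10 percent trimmed mean calculation is shown below. It approximates Excel's TRIMMEAN and is not code from the book.

#include <algorithm>
#include <numeric>
#include <vector>
// Rough analog of TRIMMEAN(array, 0.10): drop about 5 percent of the values
// from each end of the sorted data, then average the remainder.
static double TrimmedMean(std::vector<double> t, double fraction = 0.10)
{
    if (t.empty())
        return 0.0;
    std::sort(t.begin(), t.end());
    std::size_t n_trim = (std::size_t)(t.size() * fraction / 2.0);
    double sum = std::accumulate(t.begin() + n_trim, t.end() - n_trim, 0.0);
    return sum / (double)(t.size() - 2 * n_trim);
}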

The benchmark timing measurements cited in this book provide reasonable approximations of function execution times. They are intended to provide some insights regarding the performance of a function coded using standard C++ statements vs. a function coded using C++ SIMD intrinsic functions. Like automobile fuel economy and mobile device battery runtime estimates, software performance benchmarking is not an exact science and is subject to a variety of uncontrollable factors. It is also important to keep in mind that this book is an introductory primer about x86 SIMD programming and not benchmarking. The source code examples are structured to hasten the study of x86 SIMD programming techniques. In addition, the Visual C++ options described earlier were selected mostly for practical reasons and may not yield optimal performance in all cases. Both Visual C++ and GNU C++ include a plethora of code generation options that can affect performance. Benchmark timing measurements should always be construed in a context that is correlated with the software’s purpose. The methods described in this section are generally worthwhile, but measurement results occasionally vary. Appendix A contains additional information about the software tools used to develop the source code examples in this book.

Pixel Mean Intensity

The final source code example of this chapter, Ch02_07, demonstrates how to calculate the arithmetic mean of an array of 8-bit unsigned integers using packed integer arithmetic. Listing 2-7 shows the source code for example Ch02_07.
//------------------------------------------------
//               Ch02_07.h
//------------------------------------------------
#pragma once
#include <cstddef>
#include <cstdint>
// Ch02_07_fcpp.cpp
extern bool CalcMeanU8_Cpp(double* mean_x, uint64_t* sum_x, const uint8_t* x, size_t n);
extern bool CalcMeanU8_Iavx(double* mean_x, uint64_t* sum_x, const uint8_t* x, size_t n);
// Ch02_07_misc.cpp
extern void InitArray(uint8_t* x, size_t n, unsigned int seed);
extern bool CheckArgs(const uint8_t* x, size_t n);
// Ch02_07_bm.cpp
extern void CalcMeanU8_bm(void);
// Miscellaneous constants
const size_t c_NumElements = 10000000;
const size_t c_Alignment = 16;
const unsigned int c_RngSeedVal = 29;
extern "C" size_t g_NumElementsMax;
//------------------------------------------------
//               Ch02_07_misc.cpp
//------------------------------------------------
#include "Ch02_07.h"
#include "MT.h"
#include "AlignedMem.h"
size_t g_NumElementsMax = 64 * 1024 * 1024;
bool CheckArgs(const uint8_t* x, size_t n)
{
    if (n == 0 || n > g_NumElementsMax)
        return false;
    if ((n % 64) != 0)
        return false;
    if (!AlignedMem::IsAligned(x, c_Alignment))
        return false;
    return true;
}
void InitArray(uint8_t* x, size_t n, unsigned int rng_seed)
{
    int rng_min_val = 0;
    int rng_max_val = 255;
    MT::FillArray(x, n, rng_min_val, rng_max_val, rng_seed);
}
//------------------------------------------------
//               Ch02_07.cpp
//------------------------------------------------
#include <iostream>
#include <iomanip>
#include "Ch02_07.h"
#include "AlignedMem.h"
static void CalcMeanU8(void);
int main()
{
    CalcMeanU8();
    CalcMeanU8_bm();
}
static void CalcMeanU8(void)
{
    const char nl = '\n';
    const size_t n = c_NumElements;
    AlignedArray<uint8_t> x_aa(n, 16);
    uint8_t* x = x_aa.Data();
    InitArray(x, n, c_RngSeedVal);
    bool rc0, rc1;
    uint64_t sum_x0, sum_x1;
    double mean_x0, mean_x1;
    rc0 = CalcMeanU8_Cpp(&mean_x0, &sum_x0, x, n);
    rc1 = CalcMeanU8_Iavx(&mean_x1, &sum_x1, x, n);
    std::cout << std::fixed << std::setprecision(6);
    std::cout << " Results for CalcMeanU8_Cpp ";
    std::cout << "rc0: " << rc0 << "  ";
    std::cout << "sum_x0: " << sum_x0 << "  ";
    std::cout << "mean_x0: " << mean_x0 << nl;
    std::cout << " Results for CalcMeanU8_Iavx ";
    std::cout << "rc1: " << rc1 << "  ";
    std::cout << "sum_x1: " << sum_x1 << "  ";
    std::cout << "mean_x1: " << mean_x1 << nl;
}
//------------------------------------------------
//               Ch02_07_fcpp.cpp
//------------------------------------------------
#include <immintrin.h>
#include "Ch02_07.h"
bool CalcMeanU8_Cpp(double* mean_x, uint64_t* sum_x, const uint8_t* x, size_t n)
{
    if (!CheckArgs(x, n))
        return false;
    uint64_t sum_x_temp = 0;
    for (size_t i = 0; i < n; i++)
        sum_x_temp += x[i];
    *sum_x = sum_x_temp;
    *mean_x = (double)sum_x_temp / n;
    return true;
}
bool CalcMeanU8_Iavx(double* mean_x, uint64_t* sum_x, const uint8_t* x, size_t n)
{
    if (!CheckArgs(x, n))
        return false;
    const size_t num_simd_elements = 16;
    __m128i packed_zero = _mm_setzero_si128();
    __m128i pixel_sums_u32 = _mm_setzero_si128();
    for (size_t i = 0; i < n; i += num_simd_elements * 4)
    {
        __m128i pixel_vals_u8, pixel_vals_lo_u16, pixel_vals_hi_u16;
        __m128i pixel_sums_u16 = _mm_setzero_si128();
        // Process pixels x[i:i+15]
        pixel_vals_u8 = _mm_load_si128((__m128i*)&x[i]);
        pixel_vals_lo_u16 = _mm_unpacklo_epi8(pixel_vals_u8, packed_zero);
        pixel_vals_hi_u16 = _mm_unpackhi_epi8(pixel_vals_u8, packed_zero);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_lo_u16);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_hi_u16);
        // Process pixels x[i+16:i+31]
        pixel_vals_u8 = _mm_load_si128((__m128i*)&x[i + 16]);
        pixel_vals_lo_u16 = _mm_unpacklo_epi8(pixel_vals_u8, packed_zero);
        pixel_vals_hi_u16 = _mm_unpackhi_epi8(pixel_vals_u8, packed_zero);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_lo_u16);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_hi_u16);
        // Process pixels x[i+32:i+47]
        pixel_vals_u8 = _mm_load_si128((__m128i*)&x[i + 32]);
        pixel_vals_lo_u16 = _mm_unpacklo_epi8(pixel_vals_u8, packed_zero);
        pixel_vals_hi_u16 = _mm_unpackhi_epi8(pixel_vals_u8, packed_zero);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_lo_u16);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_hi_u16);
        // Process pixels x[i+48:i+63]
        pixel_vals_u8 = _mm_load_si128((__m128i*)&x[i + 48]);
        pixel_vals_lo_u16 = _mm_unpacklo_epi8(pixel_vals_u8, packed_zero);
        pixel_vals_hi_u16 = _mm_unpackhi_epi8(pixel_vals_u8, packed_zero);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_lo_u16);
        pixel_sums_u16 = _mm_add_epi16(pixel_sums_u16, pixel_vals_hi_u16);
        // Convert pixel_sums_u16 to u32, then update pixel_sums_u32
        __m128i pixel_sums_lo_u32 = _mm_unpacklo_epi16(pixel_sums_u16, packed_zero);
        __m128i pixel_sums_hi_u32 = _mm_unpackhi_epi16(pixel_sums_u16, packed_zero);
        pixel_sums_u32 = _mm_add_epi32(pixel_sums_u32, pixel_sums_lo_u32);
        pixel_sums_u32 = _mm_add_epi32(pixel_sums_u32, pixel_sums_hi_u32);
    }
    // Reduce pixel_sums_u32 to single sum value
    // _mm_extract_epi32 returns a signed int; cast to uint32_t to avoid
    // sign extension for lane sums larger than INT32_MAX
    uint64_t pixel_sum = (uint32_t)_mm_extract_epi32(pixel_sums_u32, 0);
    pixel_sum += (uint32_t)_mm_extract_epi32(pixel_sums_u32, 1);
    pixel_sum += (uint32_t)_mm_extract_epi32(pixel_sums_u32, 2);
    pixel_sum += (uint32_t)_mm_extract_epi32(pixel_sums_u32, 3);
    // Calculate mean
    *sum_x = pixel_sum;
    *mean_x = (double)pixel_sum / n;
    return true;
}
Listing 2-7. Example Ch02_07

The organization of the code in this example is similar to Ch02_06. File Ch02_07_misc.cpp contains two functions, CheckArgs() and InitArray(), which perform argument checking and array initialization, respectively. Note that the number of elements in the source array must be an integral multiple of 64 and that the array must be aligned on a 16-byte boundary. The function CheckArgs() also verifies that the number of array elements n is greater than zero and does not exceed g_NumElementsMax. This size restriction enables the C++ SIMD code to perform intermediate calculations using packed 32-bit unsigned integers without any safeguards for arithmetic overflows.
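The alignment test is delegated to AlignedMem::IsAligned(), a utility from the book's support files whose source is not shown in Listing 2-7. As a rough sketch only (the helper name IsAlignedPtr is illustrative, not necessarily the book's implementation), a pointer alignment check can be written as follows:
#include <cstddef>
#include <cstdint>
// Returns true if pointer p is aligned on an 'alignment'-byte boundary.
// Assumes alignment is a nonzero power of two.
inline bool IsAlignedPtr(const void* p, size_t alignment)
{
    return (reinterpret_cast<uintptr_t>(p) & (alignment - 1)) == 0;
}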

Calculation of an array mean is straightforward; a function must sum the elements of the array and then divide this sum by the total number of elements. The function CalcMeanU8_Cpp() accomplishes this using a simple for-loop and scalar floating-point division.

The C++ SIMD counterpart function is named CalcMeanU8_Iavx(). Following argument validation using CheckArgs(), CalcMeanU8_Iavx() initializes variables packed_zero and pixel_sums_u32 to all zeros. The former variable is employed by the for-loop to perform unsigned integer size promotions, and the latter maintains four 32-bit unsigned integer intermediate sum values. The main for-loop is next. Note that each for-loop iteration processes 64 array elements since the index variable i is incremented by num_simd_elements * 4. The reason for doing this is that it reduces the number of 8-bit to 32-bit unsigned integer size promotions required to calculate the final pixel sum.
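The size restriction enforced by CheckArgs() can be confirmed with a little worst-case arithmetic. The following compile-time sketch (not part of example Ch02_07; the constant names are illustrative) verifies that neither the 16-bit nor the 32-bit accumulator lanes can overflow when n equals g_NumElementsMax and every pixel equals 255:
#include <cstdint>
// Worst-case accumulator bounds for CalcMeanU8_Iavx(), assuming the maximum
// array size of 64 * 1024 * 1024 elements and a maximum pixel value of 255.
constexpr uint64_t c_MaxPixel = 255;
constexpr uint64_t c_MaxElems = 64ull * 1024 * 1024;
constexpr uint64_t c_Iterations = c_MaxElems / 64;                   // 64 pixels per loop iteration
// Each 16-bit lane of pixel_sums_u16 accumulates 8 pixel values per iteration
constexpr uint64_t c_MaxLaneU16 = 8 * c_MaxPixel;                    // 2040
static_assert(c_MaxLaneU16 <= UINT16_MAX, "16-bit lane sum can overflow");
// Each 32-bit lane of pixel_sums_u32 accumulates two 16-bit lane sums per iteration
constexpr uint64_t c_MaxLaneU32 = c_Iterations * 2 * c_MaxLaneU16;   // 4,278,190,080
static_assert(c_MaxLaneU32 <= UINT32_MAX, "32-bit lane sum can overflow");
With every pixel at its maximum value of 255 and n at its maximum of 67,108,864, each 32-bit lane tops out at 4,278,190,080, which is just below the unsigned 32-bit limit of 4,294,967,295.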

The first executable statement of the for-loop sets pixel_sums_u16 to zero. This variable holds eight 16-bit unsigned integer sums. The _mm_load_si128() call that follows loads 16 pixels into the variable pixel_vals_u8. The next two statements employ the C++ SIMD intrinsic functions _mm_unpacklo_epi8() and _mm_unpackhi_epi8() to size-promote the pixel values from 8 bits to 16 bits. Following this size promotion task, the for-loop executes two _mm_add_epi16() calls to update the intermediate packed sum value pixel_sums_u16 as shown in Figure 2-5. The for-loop repeats this pixel-load and size-promotion summing sequence three more times. The final code block in the for-loop promotes the 16-bit unsigned integer values in pixel_sums_u16 to 32 bits and adds these values to pixel_sums_u32. The for-loop uses the C++ SIMD intrinsic functions _mm_unpacklo_epi16() and _mm_unpackhi_epi16() to carry out the size promotions and _mm_add_epi32() to perform the packed 32-bit addition.
Figure 2-5. Pixel size-promotion and summing sequence used in CalcMeanU8_Iavx()
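The zero-extension behavior of _mm_unpacklo_epi8() and _mm_unpackhi_epi8() illustrated in Figure 2-5 can also be observed with a small standalone program. The following sketch is not part of example Ch02_07; it simply loads 16 known pixel values, interleaves them with zero bytes, and prints the resulting 16-bit values:
#include <immintrin.h>
#include <cstdint>
#include <iostream>
int main()
{
    // 16 test pixel values: 0, 10, 20, ..., 150
    alignas(16) uint8_t pixels[16];
    for (int i = 0; i < 16; i++)
        pixels[i] = (uint8_t)(i * 10);
    __m128i packed_zero = _mm_setzero_si128();
    __m128i vals_u8 = _mm_load_si128((__m128i*)pixels);
    // Interleaving with zero bytes zero-extends each 8-bit value to 16 bits
    __m128i vals_lo_u16 = _mm_unpacklo_epi8(vals_u8, packed_zero);   // pixels[0:7]
    __m128i vals_hi_u16 = _mm_unpackhi_epi8(vals_u8, packed_zero);   // pixels[8:15]
    alignas(16) uint16_t lo[8], hi[8];
    _mm_store_si128((__m128i*)lo, vals_lo_u16);
    _mm_store_si128((__m128i*)hi, vals_hi_u16);
    for (int i = 0; i < 8; i++)
        std::cout << "lo[" << i << "] = " << lo[i] << "  hi[" << i << "] = " << hi[i] << '\n';
    return 0;
}
The program prints lo[0] = 0 through lo[7] = 70 and hi[0] = 80 through hi[7] = 150, confirming that each byte was zero-extended into a 16-bit element.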

Following execution of the for-loop, four _mm_extract_epi32() calls are employed to extract each 32-bit unsigned integer intermediate sum value from pixel_sums_u32. After summing these values, the final pixel mean is calculated using scalar floating-point division. Here are the results for source code example Ch02_07:
Results for CalcMeanU8_Cpp
rc0: 1  sum_x0: 1275046509  mean_x0: 127.504651
Results for CalcMeanU8_Iavx
rc1: 1  sum_x1: 1275046509  mean_x1: 127.504651
Running benchmark function CalcMeanU8_bm - please wait
Benchmark times save to file Ch02_07_CalcMeanU8_bm_LITHIUM.csv
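Returning briefly to the reduction step that precedes the program output above: it can be exercised in isolation. The following standalone sketch (not part of the book's example files) builds a packed operand with four known 32-bit values and reduces it to a single scalar sum using _mm_extract_epi32():
#include <immintrin.h>
#include <cstdint>
#include <iostream>
int main()
{
    // Four known 32-bit values in a 128-bit operand
    __m128i sums_u32 = _mm_setr_epi32(100, 200, 300, 400);
    // Extract each 32-bit element and accumulate in a 64-bit scalar sum
    uint64_t total = (uint32_t)_mm_extract_epi32(sums_u32, 0);
    total += (uint32_t)_mm_extract_epi32(sums_u32, 1);
    total += (uint32_t)_mm_extract_epi32(sums_u32, 2);
    total += (uint32_t)_mm_extract_epi32(sums_u32, 3);
    std::cout << "total = " << total << '\n';   // prints 1000
    return 0;
}
Note that _mm_extract_epi32() requires SSE4.1 support, which any AVX-capable processor provides.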
Table 2-4 shows some benchmark timing measurements for source code example Ch02_07. While not as large as in the previous example, the speedup achieved by using C++ SIMD intrinsic functions to calculate the arithmetic mean of a pixel array is still considerable compared to the standard C++ method.
Table 2-4. Pixel Array Arithmetic Mean Execution Times (Microseconds), 10,000,000 pixels

CPU                       CalcMeanU8_Cpp()    CalcMeanU8_Iavx()
Intel Core i7-8700K       2234                462
Intel Core i5-11600K      1856                288

Summary

Table 2-5 summarizes the C++ SIMD intrinsic functions used by the source code examples in this chapter. It also lists suffixes for the various size variants. Before proceeding to the next chapter, you should understand the SIMD arithmetic calculation or data manipulation operation that is performed by each function shown in Table 2-5.
Table 2-5. C++ SIMD Intrinsic Function Summary for Chapter 2

C++ SIMD Function Names                        Description
_mm_add_epi8, _epi16, _epi32, _epi64           Packed integer addition
_mm_adds_epi8, _epi16                          Packed signed integer addition (saturated)
_mm_adds_epu8, _epu16                          Packed unsigned integer addition (saturated)
_mm_and_si128                                  Bitwise logical AND
_mm_extract_epi8, _epi16, _epi32, _epi64       Extract integer
_mm_load_si128                                 Load (aligned) 128-bit wide packed integers
_mm_max_epi8, _epi16, _epi32, _epi64           Packed signed integer maximum
_mm_max_epu8, _epu16, _epu32, _epu64           Packed unsigned integer maximum
_mm_min_epi8, _epi16, _epi32, _epi64           Packed signed integer minimum
_mm_min_epu8, _epu16, _epu32, _epu64           Packed unsigned integer minimum
_mm_mul_epi32                                  Packed 32-bit signed integer multiplication
_mm_mul_epu32                                  Packed 32-bit unsigned integer multiplication
_mm_mulhi_epi16                                Packed 16-bit signed integer multiplication (high result)
_mm_mulhi_epu16                                Packed 16-bit unsigned integer multiplication (high result)
_mm_mullo_epi16, _epi32, _epi64                Packed signed integer multiplication (low result)
_mm_or_si128                                   Bitwise logical OR
_mm_set1_epi8, _epi16, _epi32, _epi64          Broadcast integer constant to all elements
_mm_setzero_si128                              Set 128-bit wide SIMD operand to all zeros
_mm_slli_epi16, _epi32, _epi64                 Packed integer shift left logical
_mm_slli_si128                                 128-bit wide shift left logical
_mm_srai_epi16, _epi32, _epi64                 Packed integer shift right arithmetic
_mm_srli_epi16, _epi32, _epi64                 Packed integer shift right logical
_mm_srli_si128                                 128-bit wide shift right logical
_mm_store_si128                                Store (aligned) 128-bit wide packed integers
_mm_sub_epi8, _epi16, _epi32, _epi64           Packed integer subtraction
_mm_subs_epi8, _epi16                          Packed signed integer subtraction (saturated)
_mm_subs_epu8, _epu16                          Packed unsigned integer subtraction (saturated)
_mm_unpackhi_epi8, _epi16, _epi32, _epi64      Unpack and interleave high-order integers
_mm_unpacklo_epi8, _epi16, _epi32, _epi64      Unpack and interleave low-order integers
_mm_xor_si128                                  Bitwise logical XOR
