Search in book...
Toggle Font Controls
Create new playlist

Name your new playlist

Playlist description (optional)
Sign In

Email address

Password

Forgot Password?

or

Continue with Facebook

Continue with Google
Sign Up

Full Name

Email address

Confirm Email Address

Password

or

Continue with Facebook

Continue with Google

J. Van HoeyBeginning x64 Assembly Programminghttps://doi.org/10.1007/978-1-4842-5076-1_38

38. Performance Optimization

Jo Van Hoey¹

(1)

Hamme, Belgium

You will agree that a lot of the AVX instructions are far from intuitive, especially the different mask layouts that make the code difficult to read and understand. Moreover, the bit masks are sometimes written in hexadecimal notation, so you have to convert them first to binary notation to see what they do.

In this chapter, we will demonstrate that using AVX instructions can dramatically improve performance, and the effort of using AVX pays off in a number of cases. You can find an interesting white paper on benchmarking code at https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf .

In our examples, we will use the measuring method presented in this white paper.

Transpose Computation Performance

In the example code shown in Listing 38-1, we have two methods of computing the transpose matrix, one using “classic” assembler instructions and another using AVX instructions. We added code to measure the execution times of both algorithms.

; transpose.asm

extern printf

section .data

fmt0 db "4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE",10,0

fmt1 db 10,"This is the matrix:",10,0

fmt2 db 10,"This is the transpose (sequential version): ",10,0

fmt3 db 10,"This is the transpose (AVX version): ",10,0

fmt4 db 10,"Number of loops: %d",10,0

fmt5 db "Sequential version elapsed cycles: %d",10,0

fmt6 db "AVX Shuffle version elapsed cycles: %d",10,0

align 32

matrix dq 1., 2., 3., 4.

dq 5., 6., 7., 8.

dq 9., 10., 11., 12.

dq 13., 14., 15., 16.

loops dq 10000

section .bss

alignb 32

transpose resq 16

bahi_cy resq 1 ;timers for avx version

balo_cy resq 1

eahi_cy resq 1

ealo_cy resq 1

bshi_cy resq 1 ;timers for sequential version

bslo_cy resq 1

eshi_cy resq 1

eslo_cy resq 1

section .text

global main

main:

push rbp

mov rbp,rsp

; print title

mov rdi, fmt0

call printf

; print matrix

mov rdi,fmt1

call printf

mov rsi,matrix

call printm4x4

; SEQUENTIAL VERSION

; compute transpose

mov rdi, matrix

mov rsi, transpose

mov rdx, [loops]

;start measuring the cycles

cpuid

rdtsc

mov [bshi_cy],edx

mov [bslo_cy],eax

call seq_transpose

;stop measuring the cycles

rdtscp

mov [eshi_cy],edx

mov [eslo_cy],eax

cpuid

;print the result

mov rdi,fmt2

call printf

mov rsi,transpose

call printm4x4

; AVX VERSION

; compute transpose

mov rdi, matrix

mov rsi, transpose

mov rdx, [loops]

;start measuring the cycles

cpuid

rdtsc

mov [bahi_cy],edx

mov [balo_cy],eax

call AVX_transpose

;stop measuring the cycles

rdtscp

mov [eahi_cy],edx

mov [ealo_cy],eax

cpuid

;print the result

mov rdi,fmt3

call printf

mov rsi,transpose

call printm4x4

;print the loops

mov rdi,fmt4

mov rsi,[loops]

call printf

;print the cycles

;cycles sequential version

mov rdx,[eslo_cy]

mov rsi,[eshi_cy]

shl rsi,32

or rsi,rdx ;rsi contains end time

mov r8,[bslo_cy]

mov r9,[bshi_cy]

shl r9,32

or r9,r8 ;r9 contains start time

sub rsi,r9 ;rsi contains elapsed

;print the timing result

mov rdi,fmt5

call printf

;cycles AVX blend version

mov rdx,[ealo_cy]

mov rsi,[eahi_cy]

shl rsi,32

or rsi,rdx ;rsi contains end time

mov r8,[balo_cy]

mov r9,[bahi_cy]

shl r9,32

or r9,r8 ;r9 contains start time

sub rsi,r9 ;rsi contains elapsed

;print the timing result

mov rdi,fmt6

call printf

leave

ret

;-----------------------------------------------------

seq_transpose:

push rbp

mov rbp,rsp

.loopx: ; the number of loops

pxor xmm0,xmm0

xor r10,r10

xor rax,rax

mov r12,4

.loopo:

push rcx

mov r13,4

.loopi:

movsd xmm0, [rdi+r10]

movsd [rsi+rax], xmm0

add r10,8

add rax,32

dec r13

jnz .loopi

add rax,8

xor rax,10000000b ;rax - 128

inc rbx

dec r12

jnz .loopo

dec rdx

jnz .loopx

leave

ret

;---------------------------------------------------------------

AVX_transpose:

push rbp

mov rbp,rsp

.loopx: ; the number of loops

;load matrix into the registers

vmovapd ymm0,[rdi] ; 1 2 3 4

vmovapd ymm1,[rdi+32] ; 5 6 7 8

vmovapd ymm2,[rdi+64] ; 9 10 11 12

vmovapd ymm3,[rdi+96] ; 13 14 15 16

;shuffle

vshufpd ymm12,ymm0,ymm1, 0000b ; 1 5 3 7

vshufpd ymm13,ymm0,ymm1, 1111b ; 2 6 4 8

vshufpd ymm14,ymm2,ymm3, 0000b ; 9 13 11 15

vshufpd ymm15,ymm2,ymm3, 1111b ; 10 14 12 16

;permutate

vperm2f128 ymm0,ymm12,ymm14, 00100000b ; 1 5 9 13

vperm2f128 ymm1,ymm13,ymm15, 00100000b ; 2 6 10 14

vperm2f128 ymm2,ymm12,ymm14, 00110001b ; 3 7 11 15

vperm2f128 ymm3,ymm13,ymm15, 00110001b ; 4 8 12 16

;write to memory

vmovapd [rsi], ymm0

vmovapd [rsi+32],ymm1

vmovapd [rsi+64],ymm2

vmovapd [rsi+96],ymm3

dec rdx

jnz .loopx

leave

ret

;---------------------------------------------------------------

printm4x4:

section .data

.fmt db "%f",9,"%f",9, "%f",9,"%f",10,0

section .text

push rbp

mov rbp,rsp

push rbx ;callee saved

push r15 ;callee saved

mov rdi,.fmt

mov rcx,4

xor rbx,rbx ;row counter

.loop:

movsd xmm0, [rsi+rbx]

movsd xmm1, [rsi+rbx+8]

movsd xmm2, [rsi+rbx+16]

movsd xmm3, [rsi+rbx+24]

mov rax,4 ; four floats

push rcx ;caller saved

push rsi ;caller saved

push rdi ;caller saved

;align stack if needed

xor r15,r15

test rsp,0fh ;last byte is 8 (not aligned)?

setnz r15b ;set if not aligned

shl r15,3 ;multiply by 8

sub rsp,r15 ;substract 0 or 8

call printf

add rsp,r15 ;add 0 or 8

pop rdi

pop rsi

pop rcx

add rbx,32 ;next row

loop .loop

pop r15

pop rbx

leave

ret

Listing 38-1

transpose.asm

Before we call the transpose function, we start the timing process. Modern processors support out-of-order execution code, which could result in instructions being executed at the wrong moment, before we start the timing or after we stop the timing. To avoid that, we need to use “serializing” instructions, which are instructions that guarantee that our timing instructions measure only what we want to measure. See the previous white paper for a more detailed explanation. One such instruction that can be used for serializing is cpuid. Before starting the timer with rdtsc , we execute cpuid. We use rdtsc to write the beginning timestamp counter “low cycles” in register eax and “high cycles” in edx; these values are stored in memory. The instruction rdtsc uses these two registers for historical reasons: in 32-bit processors, one register would be too small to hold the timer counts. One 32-bit register is used for the lower part of the timer counter value, and another register is used for the higher part. After recording the beginning timer counter values, we execute the code we want to measure and use the rdtscp instruction to stop the measurement. The ending “high cycles” and “low cycles” counters are stored again in memory, and cpuid is executed once again to make sure that no execution of instructions is postponed by the processor.

We use a 64-bit processor environment, so we shift left 32 the higher timestamp values and then xor the higher timestamp value with the lower timestamp value to obtain the complete timestamps in a 64-bit register. The difference between the beginning counter values and the ending counter values gives the number of cycles used.

The function seq_transpose uses “classic” instructions, and the function AVX_transpose is the transpose_shuffle4x4 function from the previous chapter. The functions are executed a large number of times as specified in the variable loops.

Figure 38-1 shows the output.

../images/483996_1_En_38_Chapter/483996_1_En_38_Fig1_HTML.jpg — Figure 38-1
transpose.asm output

You can see that using AVX instructions spectacularly speeds up the processing.

Intel has a volume dedicated to code optimization: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf .

This manual has a lot of interesting information on improving the performance of assembly code. Search for handling port 5 pressure (currently covered in Chapter 14). In that section, you will find several versions of a transpose algorithm for 8×8 matrices as well as the performance impact of different instructions. In the previous chapter, we demonstrated two ways of transposing a matrix, using unpacking and using shuffle. The Intel manuals go much deeper into the details of this subject; if performance is important to you, there are treasures to be found there.

Trace Computation Performance

Here is an example showing that AVX instructions are not always faster than “classic” assembly instructions. This example computes the trace of an 8×8 matrix:

; trace.asm

extern printf

section .data

fmt0 db "8x8 SINGLE PRECISION FLOATING POINT MATRIX TRACE",10,0

fmt1 db 10,"This is the matrix:",10,0

fmt2 db 10,"This is the trace (sequential version): %f",10,0

fmt5 db "This is the trace (AVX blend version): %f",10,0

fmt6 db 10,"This is the tranpose: ",10,0

fmt30 db "Sequential version elapsed cycles: %u",10,0

fmt31 db "AVX blend version elapsed cycles: %d",10,10,0

fmt4 db 10,"Number of loops: %d",10,0

align 32

matrix dd 1., 2., 3., 4., 5., 6., 7., 8.

dd 9., 10., 11., 12., 13., 14., 15., 16.

dd 17., 18., 19., 20., 21., 22., 23., 24.

dd 25., 26., 27., 28., 29., 30., 31., 32.

dd 33., 34., 35., 36., 37., 38., 39., 40.

dd 41., 42., 43., 44., 45., 46., 47., 48.

dd 49., 50., 51., 52., 53., 54., 55., 56.

dd 57., 58., 59., 60., 61., 62., 63., 64.

loops dq 1000

permps dd 0,1,4,5,2,3,6,7 ;mask for permutation sp values in ymm

section .bss

alignb 32

transpose resq 16

trace resq 1

bbhi_cy resq 1

bblo_cy resq 1

ebhi_cy resq 1

eblo_cy resq 1

bshi_cy resq 1

bslo_cy resq 1

eshi_cy resq 1

eslo_cy resq 1

section .text

global main

main:

push rbp

mov rbp,rsp

; print title

mov rdi, fmt0

call printf

; print matrix

mov rdi,fmt1

call printf

mov rsi,matrix

call printm8x8

; SEQUENTIAL VERSION

; compute trace

mov rdi, matrix

mov rsi, [loops]

;start measuring the cycles

cpuid

rdtsc

mov [bshi_cy],edx

mov [bslo_cy],eax

call seq_trace

;stop measuring the cycles

rdtscp

mov [eshi_cy],edx

mov [eslo_cy],eax

cpuid

;print the result

mov rdi, fmt2

mov rax,1

call printf

; BLEND VERSION

; compute trace

mov rdi, matrix

mov rsi, [loops]

;start measuring the cycles

cpuid

rdtsc

mov [bbhi_cy],edx

mov [bblo_cy],eax

call blend_trace

;stop measuring the cycles

rdtscp

mov [ebhi_cy],edx

mov [eblo_cy],eax

cpuid

;print the result

mov rdi, fmt5

mov rax,1

call printf

;print the loops

mov rdi,fmt4

mov rsi,[loops]

call printf

;print the cycles

;cycles sequential version

mov rdx,[eslo_cy]

mov rsi,[eshi_cy]

shl rsi,32

or rsi,rdx

mov r8,[bslo_cy]

mov r9,[bshi_cy]

shl r9,32

or r9,r8

sub rsi,r9 ;rsi contains elapsed

;print

mov rdi,fmt30

call printf

;cycles AVX blend version

mov rdx,[eblo_cy]

mov rsi,[ebhi_cy]

shl rsi,32

or rsi,rdx

mov r8,[bblo_cy]

mov r9,[bbhi_cy]

shl r9,32

or r9,r8

sub rsi,r9

;print

mov rdi,fmt31

call printf

leave

ret

;--------------------------------------------------------------

seq_trace:

push rbp

mov rbp,rsp

.loop0:

pxor xmm0,xmm0

mov rcx,8

xor rax,rax

xor rbx,rbx

.loop:

addss xmm0, [rdi+rax]

add rax,36 ;each row 32 bytes

loop .loop

cvtss2sd xmm0,xmm0

dec rsi

jnz .loop0

leave

ret

;--------------------------------------------------------------

blend_trace:

push rbp

mov rbp,rsp

.loop:

;build the matrix in memory

vmovaps ymm0, [rdi]

vmovaps ymm1, [rdi+32]

vmovaps ymm2, [rdi+64]

vmovaps ymm3, [rdi+96]

vmovaps ymm4, [rdi+128]

vmovaps ymm5, [rdi+160]

vmovaps ymm6, [rdi+192]

vmovaps ymm7, [rdi+224]

vblendps ymm0,ymm0,ymm1,00000010b

vblendps ymm0,ymm0,ymm2,00000100b

vblendps ymm0,ymm0,ymm3,00001000b

vblendps ymm0,ymm0,ymm4,00010000b

vblendps ymm0,ymm0,ymm5,00100000b

vblendps ymm0,ymm0,ymm6,01000000b

vblendps ymm0,ymm0,ymm7,10000000b

vhaddps ymm0,ymm0,ymm0

vmovdqu ymm1,[permps]

vpermps ymm0,ymm1,ymm0

haddps xmm0,xmm0

vextractps r8d,xmm0,0

vextractps r9d,xmm0,1

vmovd xmm0,r8d

vmovd xmm1,r9d

vaddss xmm0,xmm0,xmm1

dec rsi

jnz .loop

cvtss2sd xmm0,xmm0

leave

ret

printm8x8:

section .data

.fmt db "%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f",10,0

section .text

push rbp

mov rbp,rsp

push rbx ;callee saved

mov rdi,.fmt

mov rcx,8

xor rbx,rbx ;row counter

vzeroall

.loop:

movss xmm0, dword[rsi+rbx]

cvtss2sd xmm0,xmm0

movss xmm1, [rsi+rbx+4]

cvtss2sd xmm1,xmm1

movss xmm2, [rsi+rbx+8]

cvtss2sd xmm2,xmm2

movss xmm3, [rsi+rbx+12]

cvtss2sd xmm3,xmm3

movss xmm4, [rsi+rbx+16]

cvtss2sd xmm4,xmm4

movss xmm5, [rsi+rbx+20]

cvtss2sd xmm5,xmm5

movss xmm6, [rsi+rbx+24]

cvtss2sd xmm6,xmm6

movss xmm7, [rsi+rbx+28]

cvtss2sd xmm7,xmm7

mov rax,8 ; 8 floats

push rcx ;caller saved

push rsi ;caller saved

push rdi ;caller saved

;align stack if needed

xor r15,r15

test rsp,0fh ;last byte is 8 (not aligned)?

setnz r15b ;set if not aligned

shl r15,3 ;multiply by 8

sub rsp,r15 ;substract 0 or 8

call printf

add rsp,r15 ;add 0 or 8

pop rdi

pop rsi

pop rcx

add rbx,32 ;next row

loop .loop

pop rbx ;callee saved

leave

ret

The function blend_trace is an extension from 4×4 to 8×8 of the trace function we used in Chapter 36, in our matrix inversion code, with AVX instructions. The function seq_trace walks sequentially through the matrix, finds the trace elements, and adds them. When running this code, you will see that seq_trace is much faster than blend_trace.

Figure 38-2 shows the output.

../images/483996_1_En_38_Chapter/483996_1_En_38_Fig2_HTML.jpg — Figure 38-2
trace.asm output

If you want to know more about optimization, use the previously mentioned Intel manual. Here is another excellent source: https://www.agner.org .

Summary

In this chapter, you learned about the following:

Measuring and computing elapsed cycles
That AVX can speed up processing drastically
That AVX is not suited for every situation

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.

Table of Contents for 38. Performance Optimization

Create new playlist

Sign In

Sign Up

38. Performance Optimization

Transpose Computation Performance

Trace Computation Performance

Summary

Table of Contents for
38. Performance Optimization