© Jo Van Hoey 2019
J. Van HoeyBeginning x64 Assembly Programminghttps://doi.org/10.1007/978-1-4842-5076-1_38

38. Performance Optimization

Jo Van Hoey1 
(1)
Hamme, Belgium
 

You will agree that a lot of the AVX instructions are far from intuitive, especially the different mask layouts that make the code difficult to read and understand. Moreover, the bit masks are sometimes written in hexadecimal notation, so you have to convert them first to binary notation to see what they do.

In this chapter, we will demonstrate that using AVX instructions can dramatically improve performance, and the effort of using AVX pays off in a number of cases. You can find an interesting white paper on benchmarking code at https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf .

In our examples, we will use the measuring method presented in this white paper.

Transpose Computation Performance

In the example code shown in Listing 38-1, we have two methods of computing the transpose matrix, one using “classic” assembler instructions and another using AVX instructions. We added code to measure the execution times of both algorithms.
; transpose.asm
extern printf
section .data
      fmt0   db    "4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE",10,0
      fmt1   db    10,"This is the matrix:",10,0
      fmt2   db    10,"This is the transpose (sequential version): ",10,0
      fmt3   db    10,"This is the transpose (AVX version): ",10,0
      fmt4   db    10,"Number of loops: %d",10,0
      fmt5   db    "Sequential version elapsed cycles: %d",10,0
      fmt6   db    "AVX Shuffle version elapsed cycles: %d",10,0
      align  32
      matrix       dq     1.,     2.,     3.,     4.
                   dq     5.,     6.,     7.,     8.
                   dq     9.,    10.,    11.,    12.
                   dq    13.,    14.,    15.,    16.
      loops  dq    10000
section .bss
      alignb       32
      transpose    resq        16
      bahi_cy      resq   1  ;timers for avx version
      balo_cy      resq   1
      eahi_cy      resq   1
      ealo_cy      resq   1
      bshi_cy      resq   1  ;timers for sequential version
      bslo_cy      resq   1
      eshi_cy      resq   1
      eslo_cy      resq   1
section .text
      global main
main:
push  rbp
mov   rbp,rsp
; print title
      mov   rdi, fmt0
      call  printf
; print matrix
      mov   rdi,fmt1
      call  printf
      mov   rsi,matrix
      call  printm4x4
; SEQUENTIAL VERSION
; compute transpose
      mov   rdi, matrix
      mov   rsi, transpose
      mov   rdx, [loops]
;start measuring the cycles
      cpuid
      rdtsc
      mov   [bshi_cy],edx
      mov   [bslo_cy],eax
      call seq_transpose
;stop measuring the cycles
      rdtscp
      mov   [eshi_cy],edx
      mov   [eslo_cy],eax
      cpuid
;print the result
      mov   rdi,fmt2
      call  printf
      mov   rsi,transpose
      call  printm4x4
; AVX VERSION
; compute transpose
      mov   rdi, matrix
      mov   rsi, transpose
      mov   rdx, [loops]
;start measuring the cycles
      cpuid
      rdtsc
      mov   [bahi_cy],edx
      mov   [balo_cy],eax
      call AVX_transpose
;stop measuring the cycles
      rdtscp
      mov   [eahi_cy],edx
      mov   [ealo_cy],eax
      cpuid
;print the result
      mov   rdi,fmt3
      call  printf
      mov   rsi,transpose
      call  printm4x4
;print the loops
      mov   rdi,fmt4
      mov   rsi,[loops]
      call  printf
;print the cycles
;cycles sequential version
      mov   rdx,[eslo_cy]
      mov   rsi,[eshi_cy]
      shl   rsi,32
      or    rsi,rdx         ;rsi contains end time
      mov   r8,[bslo_cy]
      mov   r9,[bshi_cy]
      shl   r9,32
      or    r9,r8           ;r9 contains start time
      sub   rsi,r9          ;rsi contains elapsed
    ;print the timing result
      mov   rdi,fmt5
      call  printf
;cycles AVX blend version
      mov   rdx,[ealo_cy]
      mov   rsi,[eahi_cy]
      shl   rsi,32
      or    rsi,rdx          ;rsi contains end time
      mov   r8,[balo_cy]
      mov   r9,[bahi_cy]
      shl   r9,32
      or    r9,r8            ;r9 contains start time
      sub   rsi,r9           ;rsi contains elapsed
    ;print the timing result
      mov   rdi,fmt6
      call  printf
leave
ret
;-----------------------------------------------------
seq_transpose:
push  rbp
mov   rbp,rsp
.loopx:                     ; the number of loops
    pxor     xmm0,xmm0
    xor      r10,r10
    xor      rax,rax
    mov      r12,4
    .loopo:
             push  rcx
             mov   r13,4
             .loopi:
                    movsd  xmm0, [rdi+r10]
             movsd  [rsi+rax], xmm0
             add           r10,8
             add           rax,32
             dec           r13
             jnz    .loopi
             add    rax,8
             xor    rax,10000000b    ;rax - 128
             inc    rbx
             dec    r12
      jnz    .loopo
      dec rdx
jnz .loopx
leave
ret
;---------------------------------------------------------------
AVX_transpose:
push  rbp
mov   rbp,rsp
.loopx:                    ; the number of loops
;load matrix into the registers
      vmovapd       ymm0,[rdi]    ;  1   2   3   4
      vmovapd       ymm1,[rdi+32] ;  5   6   7   8
      vmovapd       ymm2,[rdi+64] ;  9  10  11  12
      vmovapd       ymm3,[rdi+96] ; 13  14  15  16
;shuffle
      vshufpd       ymm12,ymm0,ymm1, 0000b     ;  1   5   3   7
      vshufpd       ymm13,ymm0,ymm1, 1111b     ;  2   6   4   8
      vshufpd       ymm14,ymm2,ymm3, 0000b     ;  9  13  11  15
      vshufpd       ymm15,ymm2,ymm3, 1111b     ; 10  14  12  16
;permutate
      vperm2f128    ymm0,ymm12,ymm14,     00100000b    ; 1   5   9  13
      vperm2f128    ymm1,ymm13,ymm15,     00100000b    ; 2   6  10  14
      vperm2f128    ymm2,ymm12,ymm14,     00110001b    ; 3   7  11  15
      vperm2f128    ymm3,ymm13,ymm15,     00110001b    ; 4   8  12  16
;write to memory
      vmovapd       [rsi],   ymm0
      vmovapd       [rsi+32],ymm1
      vmovapd       [rsi+64],ymm2
      vmovapd       [rsi+96],ymm3
      dec rdx
      jnz .loopx
leave
ret
;---------------------------------------------------------------
printm4x4:
section .data
      .fmt   db     "%f",9,"%f",9, "%f",9,"%f",10,0
section .text
push  rbp
mov   rbp,rsp
      push   rbx               ;callee saved
      push   r15         ;callee saved
      mov           rdi,.fmt
      mov           rcx,4
      xor           rbx,rbx     ;row counter
.loop:
      movsd  xmm0, [rsi+rbx]
      movsd  xmm1, [rsi+rbx+8]
      movsd  xmm2, [rsi+rbx+16]
      movsd  xmm3, [rsi+rbx+24]
      mov          rax,4        ; four floats
        push       rcx    ;caller saved
        push       rsi    ;caller saved
        push       rdi    ;caller saved
        ;align stack if needed
        xor  r15,r15
        test rsp,0fh       ;last byte is 8 (not aligned)?
        setnz      r15b            ;set if not aligned
        shl  r15,3              ;multiply by 8
        sub  rsp,r15      ;substract 0 or 8
             call   printf
        add  rsp,r15        ;add 0 or 8
        pop  rdi
        pop  rsi
        pop  rcx
        add  rbx,32      ;next row
        loop .loop
pop r15
pop rbx
leave
ret
Listing 38-1

transpose.asm

Before we call the transpose function, we start the timing process. Modern processors support out-of-order execution code, which could result in instructions being executed at the wrong moment, before we start the timing or after we stop the timing. To avoid that, we need to use “serializing” instructions, which are instructions that guarantee that our timing instructions measure only what we want to measure. See the previous white paper for a more detailed explanation. One such instruction that can be used for serializing is cpuid. Before starting the timer with rdtsc , we execute cpuid. We use rdtsc to write the beginning timestamp counter “low cycles” in register eax and “high cycles” in edx; these values are stored in memory. The instruction rdtsc uses these two registers for historical reasons: in 32-bit processors, one register would be too small to hold the timer counts. One 32-bit register is used for the lower part of the timer counter value, and another register is used for the higher part. After recording the beginning timer counter values, we execute the code we want to measure and use the rdtscp instruction to stop the measurement. The ending “high cycles” and “low cycles” counters are stored again in memory, and cpuid is executed once again to make sure that no execution of instructions is postponed by the processor.

We use a 64-bit processor environment, so we shift left 32 the higher timestamp values and then xor the higher timestamp value with the lower timestamp value to obtain the complete timestamps in a 64-bit register. The difference between the beginning counter values and the ending counter values gives the number of cycles used.

The function seq_transpose uses “classic” instructions, and the function AVX_transpose is the transpose_shuffle4x4 function from the previous chapter. The functions are executed a large number of times as specified in the variable loops.

Figure 38-1 shows the output.
../images/483996_1_En_38_Chapter/483996_1_En_38_Fig1_HTML.jpg
Figure 38-1

transpose.asm output

You can see that using AVX instructions spectacularly speeds up the processing.

Intel has a volume dedicated to code optimization: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf .

This manual has a lot of interesting information on improving the performance of assembly code. Search for handling port 5 pressure (currently covered in Chapter 14). In that section, you will find several versions of a transpose algorithm for 8×8 matrices as well as the performance impact of different instructions. In the previous chapter, we demonstrated two ways of transposing a matrix, using unpacking and using shuffle. The Intel manuals go much deeper into the details of this subject; if performance is important to you, there are treasures to be found there.

Trace Computation Performance

Here is an example showing that AVX instructions are not always faster than “classic” assembly instructions. This example computes the trace of an 8×8 matrix:
; trace.asm
extern printf
section .data
      fmt0   db      "8x8 SINGLE PRECISION FLOATING POINT MATRIX TRACE",10,0
      fmt1   db      10,"This is the matrix:",10,0
      fmt2   db      10,"This is the trace (sequential version): %f",10,0
      fmt5   db      "This is the trace (AVX blend version): %f",10,0
      fmt6   db      10,"This is the tranpose: ",10,0
      fmt30  db      "Sequential version elapsed cycles: %u",10,0
      fmt31  db      "AVX blend  version elapsed cycles: %d",10,10,0
      fmt4   db      10,"Number of loops: %d",10,0
      align  32
      matrix dd 1.,     2.,     3.,     4.,     5.,     6.,     7.,     8.
             dd 9.,     10.,    11.,    12.,    13.,    14.,    15.,    16.
             dd 17.,    18.,    19.,    20.,    21.,    22.,    23.,    24.
             dd 25.,    26.,    27.,    28.,    29.,    30.,    31.,    32.
             dd 33.,    34.,    35.,    36.,    37.,    38.,    39.,    40.
             dd 41.,    42.,    43.,    44.,    45.,    46.,    47.,    48.
             dd 49.,    50.,    51.,    52.,    53.,    54.,    55.,    56.
             dd 57.,    58.,    59.,    60.,    61.,    62.,    63.,    64.
      loops  dq    1000
      permps dd    0,1,4,5,2,3,6,7  ;mask for permutation sp values in ymm
section .bss
      alignb       32
      transpose    resq   16
      trace        resq   1
      bbhi_cy      resq   1
      bblo_cy      resq   1
      ebhi_cy      resq   1
      eblo_cy      resq   1
      bshi_cy      resq   1
      bslo_cy      resq   1
      eshi_cy      resq   1
      eslo_cy      resq   1
section .text
      global main
main:
push  rbp
mov   rbp,rsp
; print title
      mov    rdi, fmt0
      call   printf
; print matrix
      mov    rdi,fmt1
      call   printf
      mov    rsi,matrix
      call   printm8x8
; SEQUENTIAL VERSION
; compute trace
      mov    rdi, matrix
      mov    rsi, [loops]
;start measuring the cycles
      cpuid
      rdtsc
      mov    [bshi_cy],edx
      mov    [bslo_cy],eax
      call   seq_trace
;stop measuring the cycles
      rdtscp
      mov    [eshi_cy],edx
      mov    [eslo_cy],eax
      cpuid
;print the result
      mov    rdi, fmt2
      mov    rax,1
      call   printf
; BLEND VERSION
; compute trace
      mov    rdi, matrix
      mov    rsi, [loops]
;start measuring the cycles
      cpuid
      rdtsc
      mov    [bbhi_cy],edx
      mov    [bblo_cy],eax
      call   blend_trace
;stop measuring the cycles
      rdtscp
      mov    [ebhi_cy],edx
      mov    [eblo_cy],eax
      cpuid
;print the result
      mov    rdi, fmt5
      mov    rax,1
      call   printf
;print the loops
      mov    rdi,fmt4
      mov    rsi,[loops]
      call   printf
;print the cycles
;cycles sequential version
      mov    rdx,[eslo_cy]
      mov    rsi,[eshi_cy]
      shl    rsi,32
      or     rsi,rdx
      mov    r8,[bslo_cy]
      mov    r9,[bshi_cy]
      shl    r9,32
      or     r9,r8
      sub    rsi,r9        ;rsi contains elapsed
    ;print
      mov    rdi,fmt30
      call printf
;cycles AVX blend version
      mov   rdx,[eblo_cy]
      mov   rsi,[ebhi_cy]
      shl   rsi,32
      or    rsi,rdx
      mov   r8,[bblo_cy]
      mov   r9,[bbhi_cy]
      shl   r9,32
      or    r9,r8
      sub   rsi,r9
    ;print
      mov   rdi,fmt31
      call  printf
leave
ret
;--------------------------------------------------------------
seq_trace:
push  rbp
mov   rbp,rsp
.loop0:
      pxor  xmm0,xmm0
      mov   rcx,8
      xor   rax,rax
      xor   rbx,rbx
      .loop:
      addss xmm0, [rdi+rax]
      add   rax,36   ;each row 32 bytes
      loop  .loop
      cvtss2sd     xmm0,xmm0
      dec          rsi
      jnz          .loop0
leave
ret
;--------------------------------------------------------------
blend_trace:
push   rbp
mov     rbp,rsp
.loop:
    ;build the matrix in memory
      vmovaps       ymm0, [rdi]
      vmovaps       ymm1, [rdi+32]
      vmovaps       ymm2, [rdi+64]
      vmovaps       ymm3, [rdi+96]
      vmovaps       ymm4, [rdi+128]
      vmovaps       ymm5, [rdi+160]
      vmovaps       ymm6, [rdi+192]
      vmovaps       ymm7, [rdi+224]
      vblendps      ymm0,ymm0,ymm1,00000010b
      vblendps      ymm0,ymm0,ymm2,00000100b
      vblendps      ymm0,ymm0,ymm3,00001000b
      vblendps      ymm0,ymm0,ymm4,00010000b
      vblendps      ymm0,ymm0,ymm5,00100000b
      vblendps      ymm0,ymm0,ymm6,01000000b
      vblendps      ymm0,ymm0,ymm7,10000000b
      vhaddps       ymm0,ymm0,ymm0
      vmovdqu       ymm1,[permps]
      vpermps       ymm0,ymm1,ymm0
      haddps        xmm0,xmm0
      vextractps    r8d,xmm0,0
      vextractps    r9d,xmm0,1
      vmovd         xmm0,r8d
      vmovd         xmm1,r9d
      vaddss        xmm0,xmm0,xmm1
      dec           rsi
      jnz           .loop
cvtss2sd xmm0,xmm0
leave
ret
printm8x8:
section .data
.fmt db      "%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f,",9,"%.f",10,0
section .text
push  rbp
mov   rbp,rsp
      push   rbx            ;callee saved
      mov    rdi,.fmt
      mov    rcx,8
      xor    rbx,rbx    ;row counter
      vzeroall
.loop:
      movss         xmm0, dword[rsi+rbx]
        cvtss2sd    xmm0,xmm0
      movss         xmm1, [rsi+rbx+4]
        cvtss2sd    xmm1,xmm1
      movss         xmm2, [rsi+rbx+8]
        cvtss2sd    xmm2,xmm2
      movss         xmm3, [rsi+rbx+12]
        cvtss2sd    xmm3,xmm3
      movss         xmm4, [rsi+rbx+16]
        cvtss2sd    xmm4,xmm4
      movss         xmm5, [rsi+rbx+20]
        cvtss2sd    xmm5,xmm5
      movss         xmm6, [rsi+rbx+24]
        cvtss2sd    xmm6,xmm6
      movss         xmm7, [rsi+rbx+28]
        cvtss2sd    xmm7,xmm7
      mov     rax,8 ; 8 floats
      push    rcx        ;caller saved
      push    rsi        ;caller saved
      push    rdi        ;caller saved
        ;align stack if needed
      xor     r15,r15
      test    rsp,0fh            ;last byte is 8 (not aligned)?
      setnz   r15b               ;set if not aligned
      shl     r15,3          ;multiply by 8
      sub     rsp,r15        ;substract 0 or 8
      call    printf
      add     rsp,r15        ;add 0 or 8
      pop     rdi
      pop     rsi
      pop     rcx
      add     rbx,32     ;next row
      loop    .loop
pop rbx     ;callee saved
leave
ret

The function blend_trace is an extension from 4×4 to 8×8 of the trace function we used in Chapter 36, in our matrix inversion code, with AVX instructions. The function seq_trace walks sequentially through the matrix, finds the trace elements, and adds them. When running this code, you will see that seq_trace is much faster than blend_trace.

Figure 38-2 shows the output.
../images/483996_1_En_38_Chapter/483996_1_En_38_Fig2_HTML.jpg
Figure 38-2

trace.asm output

If you want to know more about optimization, use the previously mentioned Intel manual. Here is another excellent source: https://www.agner.org .

Summary

In this chapter, you learned about the following:
  • Measuring and computing elapsed cycles

  • That AVX can speed up processing drastically

  • That AVX is not suited for every situation

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.147.72.74