© Jo Van Hoey 2019
J. Van Hoey, Beginning x64 Assembly Programming, https://doi.org/10.1007/978-1-4842-5076-1_36

36. AVX Matrix Operations

Jo Van Hoey1 
(1)
Hamme, Belgium
 

Instead of summing up a number of possibly interesting AVX instructions, let’s look at some matrix operations using AVX. This is a long chapter with several pages of code; a lot will be familiar, but we will introduce several new instructions here.

We will show matrix multiplication and matrix inversion. In the next chapter, we will show how to transpose a matrix.

Example Matrix Code

Listing 36-1 shows the example code.
; matrix4x4.asm
; Demo of 4x4 double-precision matrix operations: print, multiply, invert.
; The routines below (main, inverse4x4, vtrace, printm4x4, multi4x4) all
; work on matrices stored row by row, four qwords (32 bytes) per row.
extern printf
section .data
; zero-terminated strings passed to printf; 10 is the newline character
      fmt0   db    10,"4x4 DOUBLE PRECISION FLOATING POINT MATRICES",10,0
      fmt1   db    10,"This is matrixA:",10,0
      fmt2   db    10,"This is matrixB:",10,0
      fmt3   db    10,"This is matrixA x matrixB:",10,0
      fmt4   db    10,"This is matrixC:",10,0
      fmt5   db    10,"This is the inverse of matrixC:",10,0
      fmt6   db    10,"Proof: matrixC x inverse =",10,0
      fmt7   db    10,"This is matrixS:",10,0
      fmt8   db    10,"This is the inverse of matrixS:",10,0
      fmt9   db    10,"Proof: matrixS x inverse =",10,0
      fmt10  db    10,"This matrix is singular!",10,10,0
; the matrices are read with vmovapd into ymm registers, which needs
; 32-byte alignment; each matrix is 128 bytes, so all four stay aligned
      align 32
; matrixA and matrixB are the operands of the multiplication demo
        matrixA     dq     1.,  3.,  5.,  7.
                    dq     9., 11., 13., 15.
                    dq    17., 19., 21., 23.
                    dq    25., 27., 29., 31.
        matrixB     dq     2.,  4.,  6.,  8.
                    dq    10., 12., 14., 16.
                    dq    18., 20., 22., 24.
                    dq    26., 28., 30., 32.
; matrixC is the input of the first inversion demo
        matrixC     dq     2.,          11.,        21.,      37.
                    dq     3.,          13.,        23.,      41.
                    dq     5.,          17.,        29.,      43.
                    dq     7.,          19.,        31.,      47.
; matrixS is the input of the second inversion demo (main labels it
; "Singular matrix")
        matrixS     dq      1.,      2.,      3.,         4.
                    dq      5.,      6.,      7.,         8.
                    dq      9.,     10.,     11.,        12.
                    dq     13.,     14.,     15.,        16.
section .bss
; result buffers, written with vmovapd and therefore 32-byte aligned
      alignb 32
      product resq 16           ;receives the output of multi4x4
      inverse resq 16           ;receives the output of inverse4x4
section .text
      global main
;------------------------------------------------------
; main: demo driver.
;   1. prints matrixA and matrixB and their product
;   2. prints matrixC, its inverse, and the product of both as a proof
;   3. repeats step 2 for matrixS
; If inverse4x4 returns 1 in rax, the "singular" message is printed and
; the program ends.
; ABI:  SysV AMD64. printf is variadic, so al must hold the number of
;       vector registers used for arguments; the plain string calls here
;       pass none, hence the xor eax,eax before each of them.
; Out:  rax = 0
;------------------------------------------------------
main:
      push   rbp
      mov    rbp,rsp
; print title
      mov    rdi,fmt0
      xor    eax,eax            ;no vector arguments
      call   printf
; print matrixA
      mov    rdi,fmt1
      xor    eax,eax
      call   printf
      mov    rsi,matrixA
      call   printm4x4
; print matrixB
      mov    rdi,fmt2
      xor    eax,eax
      call   printf
      mov    rsi,matrixB
      call   printm4x4
; compute the product matrixA x matrixB
      mov    rdi,matrixA
      mov    rsi,matrixB
      mov    rdx,product
      call   multi4x4
; print the product
      mov    rdi,fmt3
      xor    eax,eax
      call   printf
      mov    rsi,product
      call   printm4x4
; print matrixC
      mov    rdi,fmt4
      xor    eax,eax
      call   printf
      mov    rsi,matrixC
      call   printm4x4
; compute the inverse of matrixC
      mov    rdi,matrixC
      mov    rsi,inverse
      call   inverse4x4
      cmp    rax,1              ;1 = singular
      je     singular
; print the inverse
      mov    rdi,fmt5
      xor    eax,eax
      call   printf
      mov    rsi,inverse
      call   printm4x4
; proof: multiply the inverse and matrixC
; (the operands are passed as inverse x matrixC; for a true inverse both
;  orders give the identity matrix)
      mov    rsi,matrixC
      mov    rdi,inverse
      mov    rdx,product
      call   multi4x4
; print the proof
      mov    rdi,fmt6
      xor    eax,eax
      call   printf
      mov    rsi,product
      call   printm4x4
; Singular matrix
; print matrixS
      mov    rdi,fmt7
      xor    eax,eax
      call   printf
      mov    rsi,matrixS
      call   printm4x4
; compute the inverse of matrixS
      mov    rdi,matrixS
      mov    rsi,inverse
      call   inverse4x4
      cmp    rax,1              ;1 = singular
      je     singular
; print the inverse
      mov    rdi,fmt8
      xor    eax,eax
      call   printf
      mov    rsi,inverse
      call   printm4x4
; proof: multiply the inverse and matrixS (same operand order as above)
      mov    rsi,matrixS
      mov    rdi,inverse
      mov    rdx,product
      call   multi4x4
; print the proof
      mov    rdi,fmt9
      xor    eax,eax
      call   printf
      mov    rsi,product
      call   printm4x4
      jmp    exit
singular:
; print error
      mov    rdi,fmt10
      xor    eax,eax
      call   printf
exit:
      xor    eax,eax            ;return 0 instead of leftover printf result
      leave
      ret
;------------------------------------------------------
; inverse4x4: invert a 4x4 matrix of doubles with the Cayley-Hamilton
; formula, using the Leverrier algorithm for the coefficients:
;     inverse = -1/p4 * (A^3 + p1*A^2 + p2*A + p3*I)
; In:   rdi = address of the matrix A (read with vmovapd: 32-byte aligned)
;       rsi = address of the result  (written with vmovapd: 32-byte aligned)
; Out:  rax = 0 and the inverse stored at [rsi] on success
;       rax = 1 if the division 1/p4 raised divide-by-zero (singular);
;             nothing is written to the result in that case
; Modifies: rcx, rdx, rsi, ymm0-ymm15, flags, MXCSR divide-by-zero flag
; Calls:    multi4x4, vtrace
;------------------------------------------------------
inverse4x4:
section .data
      align 32
      .identity    dq       1., 0., 0., 0.
                   dq       0., 1., 0., 0.
                   dq       0., 0., 1., 0.
                   dq       0., 0., 0., 1.
      ;sign bit of the low double; padded to 16 bytes because the
      ;vxorpd xmm,xmm,mem form reads 128 bits
      .minus_mask  dq      8000000000000000h, 0
      .size        dq      4                 ;4 x 4 matrices
      .one         dq      1.0
      .two         dq      2.0
      .three       dq      3.0
      .four        dq      4.0
section .bss
      alignb 32
      .matrix1 resq 16          ;p2 * A
      .matrix2 resq 16          ;A^2, later scaled to p1 * A^2
      .matrix3 resq 16          ;A^3
      .matrix4 resq 16          ;A^4, only its trace is used
      .matrixI resq 16          ;p3 * I
      .mxcsr resd 1             ;memory copy of MXCSR
section .text
      push   rbp
      mov    rbp,rsp
      push   rsi                ;save address of inverse matrix
      vzeroall                  ;start from a clean vector state
; compute the power matrices; rdi holds the address of A throughout
; matrix2 = A x A
      mov    rsi,rdi
      mov    rdx,.matrix2
      push   rdi
      call   multi4x4
      pop    rdi
; matrix3 = A x matrix2
      mov    rsi,.matrix2
      mov    rdx,.matrix3
      push   rdi
      call   multi4x4
      pop    rdi
; matrix4 = A x matrix3
      mov    rsi,.matrix3
      mov    rdx,.matrix4
      push   rdi
      call   multi4x4
      pop    rdi
; compute the traces s1..s4 of A, A^2, A^3, A^4 (low qword of xmm8..xmm11)
      mov    rsi,[.size]
      call   vtrace
      vmovapd xmm8,xmm0         ;s1
      push   rdi                ;save address of the original matrix
      mov    rdi,.matrix2
      mov    rsi,[.size]
      call   vtrace
      vmovapd xmm9,xmm0         ;s2
      mov    rdi,.matrix3
      mov    rsi,[.size]
      call   vtrace
      vmovapd xmm10,xmm0        ;s3
      mov    rdi,.matrix4
      mov    rsi,[.size]
      call   vtrace
      vmovapd xmm11,xmm0        ;s4
; compute the coefficients (only the low qword of each register matters)
; p1 = -s1
      vxorpd        xmm12,xmm8,[.minus_mask]    ;p1 in xmm12
; p2 = -1/2 * (p1 * s1 + s2)
      vmovapd       xmm13,xmm12
      vfmadd213sd   xmm13,xmm8,xmm9             ;p1*s1 + s2
      vxorpd        xmm13,xmm13,[.minus_mask]
      vdivsd        xmm13,xmm13,[.two]          ;p2 in xmm13
; p3 = -1/3 * (p2 * s1 + p1 * s2 + s3)
      vmovapd       xmm14,xmm12
      vfmadd213sd   xmm14,xmm9,xmm10            ;p1*s2 + s3
      vfmadd231sd   xmm14,xmm13,xmm8            ;... + p2*s1
      vxorpd        xmm14,xmm14,[.minus_mask]
      vdivsd        xmm14,xmm14,[.three]        ;p3 in xmm14
; p4 = -1/4 * (p3 * s1 + p2 * s2 + p1 * s3 + s4)
      vmovapd       xmm15,xmm12
      vfmadd213sd   xmm15,xmm10,xmm11           ;p1*s3 + s4
      vfmadd231sd   xmm15,xmm13,xmm9            ;... + p2*s2
      vfmadd231sd   xmm15,xmm14,xmm8            ;... + p3*s1
      vxorpd        xmm15,xmm15,[.minus_mask]
      vdivsd        xmm15,xmm15,[.four]         ;p4 in xmm15
; scale the matrices with their coefficients, one row per iteration
      mov           rcx,[.size]
      xor           eax,eax                     ;rax = byte offset of the row
      vbroadcastsd  ymm1,xmm12                  ;p1
      vbroadcastsd  ymm2,xmm13                  ;p2
      vbroadcastsd  ymm3,xmm14                  ;p3
      pop           rdi         ;restore the address of the original matrix
.loop1:
      vmovapd       ymm0,[rdi+rax]
      vmulpd        ymm0,ymm0,ymm2
      vmovapd       [.matrix1+rax],ymm0         ;matrix1 = p2 * A
      vmovapd       ymm0,[.matrix2+rax]
      vmulpd        ymm0,ymm0,ymm1
      vmovapd       [.matrix2+rax],ymm0         ;matrix2 = p1 * A^2
      vmovapd       ymm0,[.identity+rax]
      vmulpd        ymm0,ymm0,ymm3
      vmovapd       [.matrixI+rax],ymm0         ;matrixI = p3 * I
      add           rax,32
      dec           rcx
      jnz           .loop1
; compute 1/p4 and use MXCSR to detect a division by zero.
; The divide-by-zero flag (bit 2) is sticky, so clear it first; otherwise
; a flag left over from an earlier division would be taken for ours.
      stmxcsr       [.mxcsr]
      and           dword [.mxcsr],0FFFFFFFBh   ;clear bit 2
      ldmxcsr       [.mxcsr]
      vmovsd        xmm0,[.one]
      vdivsd        xmm0,xmm0,xmm15             ;1/p4
      stmxcsr       [.mxcsr]
      test          dword [.mxcsr],4            ;bit 2 set = zero division
      jnz           .singular
; no zero division
      pop           rsi                         ;address of inverse matrix
      vxorpd        xmm0,xmm0,[.minus_mask]     ;-1/p4
      vbroadcastsd  ymm2,xmm0
; add the four matrices row by row and multiply each row by -1/p4
      mov           rcx,[.size]
      xor           eax,eax
.loop2:
      vmovapd       ymm0,[.matrix1+rax]
      vaddpd        ymm0,ymm0,[.matrix2+rax]
      vaddpd        ymm0,ymm0,[.matrix3+rax]
      vaddpd        ymm0,ymm0,[.matrixI+rax]
      vmulpd        ymm0,ymm0,ymm2
      vmovapd       [rsi+rax],ymm0
      add           rax,32
      dec           rcx
      jnz           .loop2
      xor           eax,eax     ;return 0, no error
      vzeroupper                ;clean upper ymm halves for non-VEX SSE code
      leave
      ret
.singular:
      mov           eax,1       ;return 1, singular matrix
      vzeroupper
      leave                     ;also discards the saved rsi
      ret
;------------------------------------------------------
; vtrace: trace (sum of the diagonal elements) of a 4x4 matrix of doubles
; In:   rdi = address of the matrix (read with vmovapd: 32-byte aligned,
;             rows 32 bytes apart)
;       rsi = not read by this routine
; Out:  low qword of xmm0 = trace
; Modifies: ymm0-ymm3
;------------------------------------------------------
vtrace:
      push  rbp
      mov   rbp,rsp
;load the four rows into registers
      vmovapd      ymm0,[rdi]
      vmovapd      ymm1,[rdi+32]
      vmovapd      ymm2,[rdi+64]
      vmovapd      ymm3,[rdi+96]
;collect the diagonal d0..d3 in ymm0: element i is taken from row i
      vblendpd     ymm0,ymm0,ymm1,0010b
      vblendpd     ymm0,ymm0,ymm2,0100b
      vblendpd     ymm0,ymm0,ymm3,1000b
;sum the four elements
      vhaddpd      ymm0,ymm0,ymm0        ;d0+d1, d0+d1, d2+d3, d2+d3
      vpermpd      ymm0,ymm0,00100111b   ;low half becomes d2+d3, d0+d1
      vhaddpd      xmm0,xmm0,xmm0        ;VEX form of haddpd: low qword = trace
      vzeroupper                         ;keeps xmm0, clears upper ymm halves
      leave
      ret
;------------------------------------------------------
; printm4x4: print a 4x4 matrix of doubles, one row per printf call
; In:   rsi = address of the matrix (rows 32 bytes apart)
; rbx and r15 are callee-saved registers and are restored before returning
;------------------------------------------------------
printm4x4:
section .data
      .fmt db      "%f",9,"%f",9, "%f",9,"%f",10,0
section .text
      push   rbp
      mov    rbp,rsp
      push   rbx                 ;callee saved
      push   r15                 ;callee saved
      xor    ebx,ebx             ;rbx = byte offset of the current row
      mov    ecx,4               ;rcx = rows still to print
      mov    rdi,.fmt
.next_row:
      ;the four doubles of the row are the printf arguments
      movsd  xmm3,[rsi+rbx+24]
      movsd  xmm2,[rsi+rbx+16]
      movsd  xmm1,[rsi+rbx+8]
      movsd  xmm0,[rsi+rbx]
      ;printf may change these registers, keep them on the stack
      push   rcx
      push   rsi
      push   rdi
      ;r15 = 8 if rsp is not on a 16-byte boundary, else 0
      xor    r15d,r15d
      test   rsp,0xf
      setnz  r15b
      shl    r15,3
      sub    rsp,r15             ;pad the stack by 0 or 8 bytes
      mov    eax,4               ;four vector registers carry arguments
      call   printf
      add    rsp,r15             ;remove the padding again
      pop    rdi
      pop    rsi
      pop    rcx
      add    rbx,32              ;next row
      dec    rcx
      jnz    .next_row
      pop    r15
      pop    rbx
      leave
      ret
;------------------------------------------------------
; multi4x4: C = A x B for 4x4 matrices of doubles
; In:   rdi = address of A (elements are read one by one)
;       rsi = address of B (rows read with vmovapd: 32-byte aligned)
;       rdx = address of C (rows written with vmovapd: 32-byte aligned)
; Out:  rax = 0, product stored at [rdx]
; Modifies: rcx, ymm0-ymm15 (all cleared by vzeroall, ymm12-ymm15 hold
;           the running row totals), flags
; Iteration k (k = 0..3) loads row k of B and adds A[i][k] * row k to the
; total of row i of C, for i = 0..3.
;------------------------------------------------------
multi4x4:
      push  rbp
      mov   rbp,rsp
      xor   eax,eax             ;rax = 8*k, byte offset of column k in A
      mov   ecx,4               ;rcx = iterations left
      vzeroall                  ;running totals start at zero
.loop:
      vmovapd      ymm0,[rsi+rax*4]     ;row k of B (32*k bytes from rsi)
      vbroadcastsd ymm1,[rdi+rax]       ;A[0][k]
      vfmadd231pd  ymm12,ymm1,ymm0      ;ymm12 = ymm12 + ymm1*ymm0
      vbroadcastsd ymm1,[rdi+32+rax]    ;A[1][k]
      vfmadd231pd  ymm13,ymm1,ymm0
      vbroadcastsd ymm1,[rdi+64+rax]    ;A[2][k]
      vfmadd231pd  ymm14,ymm1,ymm0
      vbroadcastsd ymm1,[rdi+96+rax]    ;A[3][k]
      vfmadd231pd  ymm15,ymm1,ymm0
      add   rax,8               ;one element has 8 bytes, 64 bits
      dec   rcx
      jnz   .loop
;move the result to memory, row per row
      vmovapd      [rdx],ymm12
      vmovapd      [rdx+32],ymm13
      vmovapd      [rdx+64],ymm14
      vmovapd      [rdx+96],ymm15
      xor   eax,eax             ;return value
      vzeroupper                ;clean upper ymm halves for non-VEX SSE code
      leave
      ret
Listing 36-1

matrix4x4.asm

The interesting parts of this code are in the functions. The main function is for initializing the program, calling functions, and printing. The matrices we use in this example are 4×4 double-precision floating-point matrices. Note the 32-byte alignment of the matrices; in AVX we use ymm registers, with a size of 32 bytes. We will analyze the program function by function.

Matrix Print: printm4x4

We read the matrix one row at a time into four xmm registers, and then we push a number of registers onto the stack. These registers will be modified by printf, so we have to preserve them. Then we align the stack on a 16-byte boundary. Because of normal operation, rsp will be aligned on an 8-byte boundary. To align the stack on a 16-byte boundary, we cannot use the trick with the and instruction from Chapter 16. This is because with the and instruction, we do not know whether rsp will be changed or not. And we need the correct stack pointer because we pop the pushed registers after printf. If rsp was changed, we need to return it to its previous value before popping; otherwise, the wrong values will be popped from the stack. If rsp was not changed, we do not need to adjust it.

We will use the test instruction and 0xf to verify the alignment of the stack. If the last hexadecimal digit of rsp is a 0, then rsp is 16-byte aligned. If the last digit contains anything other than 0, then the last half-byte will have at least one of its bits set to 1. The test instruction is similar to an and instruction. If the last half-byte of rsp has one or more bits set to 1, the result of the comparison will be nonzero, and the zero-flag ZF will be cleared. The setnz (set-if-non-zero) instruction reads the zero flag (ZF), and if the ZF is not set, setnz will put 0000 0001 into r15b. If that happens, it means that rsp is not 16-byte aligned, and we will subtract 8 to put it on a 16-byte boundary. We left-shift r15b three times to obtain the decimal value 8 and do the subtraction. After the execution of printf, we restore the correct stack address by adding r15 back to rsp, that is, adding 8 if we had to align or adding 0 if we did not have to align. The stack is then where it was before our alignment, and we can pop the registers.

Matrix Multiplication: multi4x4

In the sample code and in the following explanation, we use the following two matrices:

$$ A=\left[\begin{array}{cccc}1& 3& 5& 7\\ 9& 11& 13& 15\\ 17& 19& 21& 23\\ 25& 27& 29& 31\end{array}\right] $$     $$ B=\left[\begin{array}{cccc}2& 4& 6& 8\\ 10& 12& 14& 16\\ 18& 20& 22& 24\\ 26& 28& 30& 32\end{array}\right] $$

If you studied some linear algebra, you probably learned to multiply matrices as follows: to obtain element c11 of matrix C = AB, you compute the following:
$$ {a}_{11}{b}_{11}+{a}_{12}{b}_{21}+{a}_{13}{b}_{31}+{a}_{14}{b}_{41} $$
With our example, it looks like this:
         1x2 + 3x10 + 5x18 + 7x26 = 304
As another example, element c32 would be computed as follows:
$$ {a}_{31}{b}_{12}+{a}_{32}{b}_{22}+{a}_{33}{b}_{32}+{a}_{34}{b}_{42} $$
With our example, it looks like this:
         17x4 + 19x12 + 21x20 + 23x28 = 1360

This is efficient for manual computation; however, we are going to use a method that is more appropriate for a computer. We will use the ymm registers for keeping running totals and for updating the totals in subsequent loops. Here we make use of the power of AVX instructions.

First, we clear all the ymm registers with vzeroall. Then we go into a loop four times, once for every row in matrixB. A row of four double-precision values from matrixB is loaded in ymm0. Then a value from a sequentially selected column of matrixA is broadcasted into ymm1. The register rax serves as a column counter, and the column values are at offset 0, 32, 64, and 96. Broadcasting means that all four quadwords (8 bytes each) will contain that value. Then the values in ymm1 are multiplied with the values in ymm0 and added to ymm12. The multiplying and adding are done with one instruction called vfmadd231pd, which means “vector fused multiply add packed double.” The 231 indicates how the registers are used. There are multiple variants of vfmadd (132, 213, 231), and there are variants for double precision and single precision. We used 231, which means multiply the second operand with the third operand, add to the first operand, and put the result in the first operand. This is done for every column value of the matrixA column, and then the iteration continues; the next row of matrixB is loaded, and the computation restarts.

Walk through the program with your favorite debugger. Look at how the registers ymm12, ymm13, ymm14, and ymm15 keep the running totals, and finally give the product. Your debugger probably will give the values in the ymm registers in hexadecimal and little-endian format. To make it easy, here are the details of what is happening at every step:

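| rdi (32 bytes per row) | 8 bytes | 8 bytes | 8 bytes | 8 bytes | rsi (32 bytes per row) | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
|---|---|---|---|---|---|---|---|---|---|
| 0–31 | 1 | 3 | 5 | 7 | 0–31 | 2 | 4 | 6 | 8 |
| 32–63 | 9 | 11 | 13 | 15 | 32–63 | 10 | 12 | 14 | 16 |
| 64–95 | 17 | 19 | 21 | 23 | 64–95 | 18 | 20 | 22 | 24 |
| 96–127 | 25 | 27 | 29 | 31 | 96–127 | 26 | 28 | 30 | 32 |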

Here is the first loop:

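| Instruction | Register | Value 0 | Value 1 | Value 2 | Value 3 |
|---|---|---|---|---|---|
| vmovapd ymm0, [rsi] | ymm0 | 2 | 4 | 6 | 8 |
| vbroadcastsd ymm1,[rdi+0] | ymm1 | 1 | 1 | 1 | 1 |
| vfmadd231pd ymm12,ymm1,ymm0 | ymm12 | 2 | 4 | 6 | 8 |
| vbroadcastsd ymm1,[rdi+32+0] | ymm1 | 9 | 9 | 9 | 9 |
| vfmadd231pd ymm13,ymm1,ymm0 | ymm13 | 18 | 36 | 54 | 72 |
| vbroadcastsd ymm1,[rdi+64+0] | ymm1 | 17 | 17 | 17 | 17 |
| vfmadd231pd ymm14,ymm1,ymm0 | ymm14 | 34 | 68 | 102 | 136 |
| vbroadcastsd ymm1,[rdi+96+0] | ymm1 | 25 | 25 | 25 | 25 |
| vfmadd231pd ymm15,ymm1,ymm0 | ymm15 | 50 | 100 | 150 | 200 |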

Here is the second loop:

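| Instruction | Register | Value 0 | Value 1 | Value 2 | Value 3 |
|---|---|---|---|---|---|
| vmovapd ymm0, [rsi+32] | ymm0 | 10 | 12 | 14 | 16 |
| vbroadcastsd ymm1,[rdi+8] | ymm1 | 3 | 3 | 3 | 3 |
| vfmadd231pd ymm12,ymm1,ymm0 | ymm12 | 32 | 40 | 48 | 56 |
| vbroadcastsd ymm1,[rdi+32+8] | ymm1 | 11 | 11 | 11 | 11 |
| vfmadd231pd ymm13,ymm1,ymm0 | ymm13 | 128 | 168 | 208 | 248 |
| vbroadcastsd ymm1,[rdi+64+8] | ymm1 | 19 | 19 | 19 | 19 |
| vfmadd231pd ymm14,ymm1,ymm0 | ymm14 | 224 | 296 | 368 | 440 |
| vbroadcastsd ymm1,[rdi+96+8] | ymm1 | 27 | 27 | 27 | 27 |
| vfmadd231pd ymm15,ymm1,ymm0 | ymm15 | 320 | 424 | 528 | 632 |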

Here is the third loop:

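| Instruction | Register | Value 0 | Value 1 | Value 2 | Value 3 |
|---|---|---|---|---|---|
| vmovapd ymm0, [rsi+32+32] | ymm0 | 18 | 20 | 22 | 24 |
| vbroadcastsd ymm1,[rdi+8+8] | ymm1 | 5 | 5 | 5 | 5 |
| vfmadd231pd ymm12,ymm1,ymm0 | ymm12 | 122 | 140 | 158 | 176 |
| vbroadcastsd ymm1,[rdi+32+8+8] | ymm1 | 13 | 13 | 13 | 13 |
| vfmadd231pd ymm13,ymm1,ymm0 | ymm13 | 362 | 428 | 494 | 560 |
| vbroadcastsd ymm1,[rdi+64+8+8] | ymm1 | 21 | 21 | 21 | 21 |
| vfmadd231pd ymm14,ymm1,ymm0 | ymm14 | 602 | 716 | 830 | 944 |
| vbroadcastsd ymm1,[rdi+96+8+8] | ymm1 | 29 | 29 | 29 | 29 |
| vfmadd231pd ymm15,ymm1,ymm0 | ymm15 | 842 | 1004 | 1166 | 1328 |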

Here is the fourth and last loop:

| Instruction | Register | Value 0 | Value 1 | Value 2 | Value 3 |
|---|---|---|---|---|---|
| vmovapd ymm0, [rsi+32+32+32] | ymm0 | 26 | 28 | 30 | 32 |
| vbroadcastsd ymm1,[rdi+8+8+8] | ymm1 | 7 | 7 | 7 | 7 |
| vfmadd231pd ymm12,ymm1,ymm0 | ymm12 | 304 | 336 | 368 | 400 |
| vbroadcastsd ymm1,[rdi+32+8+8+8] | ymm1 | 15 | 15 | 15 | 15 |
| vfmadd231pd ymm13,ymm1,ymm0 | ymm13 | 752 | 848 | 944 | 1040 |
| vbroadcastsd ymm1,[rdi+64+8+8+8] | ymm1 | 23 | 23 | 23 | 23 |
| vfmadd231pd ymm14,ymm1,ymm0 | ymm14 | 1200 | 1360 | 1520 | 1680 |
| vbroadcastsd ymm1,[rdi+96+8+8+8] | ymm1 | 31 | 31 | 31 | 31 |
| vfmadd231pd ymm15,ymm1,ymm0 | ymm15 | 1648 | 1872 | 2096 | 2320 |

Matrix Inversion: Inverse4x4

Mathematicians have developed a range of algorithms to efficiently compute the inverse of a matrix. It is not our intent to provide you with an inversion program with all the bells and whistles; we just want to show how to use AVX.

We will use a method based on the Cayley-Hamilton theorem about characteristic polynomials. Here is an interesting site with more information on characteristic polynomials: http://www.mcs.csueastbay.edu/~malek/Class/Characteristic.pdf .

Cayley-Hamilton Theorem

From the Cayley-Hamilton theorem, we have the following for matrix A:
$$ {A}^n+{p}_1{A}^{n-1}+\cdots +{p}_{n-1}A+{p}_nI=0 $$

where $A^n$ is A to the power of n. For example, $A^3$ is AAA, the matrix A three times multiplied with itself. The p’s are coefficients to be determined, I is the identity matrix, and 0 is the zero matrix.

Multiply the previous equation by $A^{-1}$, divide by $-p_n$, rearrange the terms, and you obtain a formula for the inverse, as shown here:
$$ \frac{1}{-{p}_n}\left[{A}^{n-1}+{p}_1{A}^{n-2}+\cdots +{p}_{n-2}A+{p}_{n-1}I\right]={A}^{-1} $$

So, to find the inverse of matrix A, we need to do a number of matrix multiplications, and we need a method to find the p’s.

For a 4×4 matrix A, we have the following:
$$ \frac{1}{-{p}_4}\left[{A}^3+{p}_1{A}^2+{p}_2A+{p}_3I\right]={A}^{-1} $$

Leverrier Algorithm

To compute the p coefficients, we use the Leverrier algorithm, also covered at http://www.mcs.csueastbay.edu/~malek/Class/Characteristic.pdf . First, we find the traces of the matrices, that is, the sum of the elements on the diagonal from the upper left to the lower right. Let’s call $s_n$ the trace of the matrix $A^n$.

For a 4×4 matrix A, we compute the traces of the power matrices of A, as shown here:
  • s1 for A

  • s2 for AA

  • s3 for AAA

  • s4 for AAAA

Leverrier gives us the following then:
$$ {p}_1=-{s}_1 $$
$$ {p}_2=-\frac{1}{2}\left({p}_1{s}_1+{s}_2\right) $$
$$ {p}_3=-\frac{1}{3}\left({p}_2{s}_1+{p}_1{s}_2+{s}_3\right) $$
$$ {p}_4=-\frac{1}{4}\left({p}_3{s}_1+{p}_2{s}_2+{p}_1{s}_3+{s}_4\right) $$

Pretty simple, right? Apart from some elaborate matrix multiplications to obtain the traces, of course.

The Code

In our function inverse4x4, we have a separate section .data, where we put our identity matrix and some variables we will use later. First, we compute the power matrices and store them in matrix2, matrix3, and matrix4. We will not use matrix1 yet. Then we call the function vtrace for every matrix to compute the traces. In the vtrace function , we first build our matrix in the ymm registers (ymm0, ymm1, ymm2, ymm3), each containing a row. Then we use the instruction vblendpd, which has four operands: two source operands, one destination operand, and a control mask. We want to extract the diagonal elements in rows 2, 3, and 4 and put them as packed values in ymm0, at locations index 1, 2, and 3. At location 0, we keep the trace element of ymm0.

The mask determines which packed values are selected from the source operands. A 1 in the mask means at this location, select the value from the second source operand. A 0 in the mask means at this location, select the value from the first source operand. See Figure 36-1 for a schematic overview, but note that in the figure we display the values in the registers in such a way that they correspond with the bit mask indexes. In your debugger, you will see that the positions in ymm0 are a1, a0, a3, a2.
../images/483996_1_En_36_Chapter/483996_1_En_36_Fig1_HTML.png
Figure 36-1

Blend mask

In the first trace computation, after the blending, the ymm0 register contains the trace elements 2, 13, 29, 47. You can check this with SASM. Don’t be fooled by the order of the values of ymm0 as represented: 13, 2, 47, 29. We now have to sum these values. This can easily be done by extracting and simply adding, but for the sake of the demo, we will use AVX instructions. We apply the horizontal add instruction vhaddpd. ymm0 then contains 15, 15, 76, 76, which are the sum of the two lower values and the sum of the two higher values. Then we execute a permutation vpermpd with mask 00100111. Each two-bit value selects a value in the source operand; see Figure 36-2 for an explanation. Now the lower half of ymm0, which is xmm0, contains two values, so we have to add these to obtain the trace. We execute a horizontal add on xmm0 with haddpd. We store the traces in xmm8, xmm9, xmm10, and xmm11 for later use.

It’s a bit overkill to obtain the trace, don’t you think? We did it this way just to show a couple of AVX instructions and how to use masks.
../images/483996_1_En_36_Chapter/483996_1_En_36_Fig2_HTML.png
Figure 36-2

Permutation mask

When we have all the traces, we can compute the p-coefficients. See how we change the sign of a value by applying a minus mask and the instruction vxorpd. We use the vfmadd213sd and vfmadd231sd to do additions and multiplications in one instruction. The instruction vfmadd213sd means multiply the first and second operands, add a third operand, and put the result in the first operand. The instruction vfmadd231sd means multiply the second and third operands, add the first operand, and put the result in the first operand. There is a list of similar instructions in the Intel manual. Study them carefully.

When we have all the coefficients, we scalar-multiply matrix, matrix2, and matrixI with the coefficients, according to the previous formulae; matrix3 has coefficient 1 in the formula and is used unchanged. The result of multiplication with matrix is put into matrix1. We do not need matrix4 anymore, so to save memory, we could have used the space for inverse as temporary memory instead of matrix4.

We have to divide by coefficient p4, so we have to check that p4 is nonzero. In this case, we could have done this simple operation after computing p4 earlier, but we wanted to show how to use the mxcsr register. We do the division with the instruction vdivsd and then store the contents of mxcsr in memory with stmxcsr. If after division the third bit (index 2) in the mxcsr register, the divide-by-zero flag, is set, then we had a zero division, and the matrix is singular and cannot be inverted. Keep in mind that this flag is sticky: once it is set, it stays set until it is explicitly cleared. In the and instruction, we used decimal 4, which is 0000 0100 in binary, so we are checking the third bit indeed. If we had a zero division, we head for the exit with 1 in rax to signal the error to the caller.

When a matrix is singular, the program will not crash because zero division is masked by default in the mxcsr register. After you finish the analysis of this code, comment out the part that checks for zero division and see what happens.

If p4 is nonzero, we add the four matrices and scalar-multiply the result with -1/p4. We do the addition and multiplication in the same loop. When everything goes fine, we have the inverse, and we return to the caller with 0 in rax.

Figure 36-3 shows the output.
../images/483996_1_En_36_Chapter/483996_1_En_36_Fig3_HTML.jpg
Figure 36-3

matrix4x4.asm output

Summary

In this chapter, you learned about the following:
  • AVX matrix operations

  • AVX instructions with three operands

  • AVX fused multiply-add operations

  • Use of masks for blending and permutations

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.14.251.128