Search in book...
Toggle Font Controls
Create new playlist

Name your new playlist

Playlist description (optional)
Sign In

Email address

Password

Forgot Password?

or

Continue with Facebook

Continue with Google
Sign Up

Full Name

Email address

Confirm Email Address

Password

or

Continue with Facebook

Continue with Google

J. Van Hoey, Beginning x64 Assembly Programming, https://doi.org/10.1007/978-1-4842-5076-1_36

36. AVX Matrix Operations

Jo Van Hoey¹

(1)

Hamme, Belgium

Instead of summing up a number of possibly interesting AVX instructions, let’s look at some matrix operations using AVX. This is a long chapter with several pages of code; a lot will be familiar, but we will introduce several new instructions here.

We will show matrix multiplication and matrix inversion. In the next chapter, we will show how to transpose a matrix.

Example Matrix Code

Listing 36-1 shows the example code.

; matrix4x4.asm

extern printf

section .data
; Headings printed by main; each one starts and ends with a newline (10)
; and is NUL-terminated for printf.
fmt0:   db 10,"4x4 DOUBLE PRECISION FLOATING POINT MATRICES",10,0
fmt1:   db 10,"This is matrixA:",10,0
fmt2:   db 10,"This is matrixB:",10,0
fmt3:   db 10,"This is matrixA x matrixB:",10,0
fmt4:   db 10,"This is matrixC:",10,0
fmt5:   db 10,"This is the inverse of matrixC:",10,0
fmt6:   db 10,"Proof: matrixC x inverse =",10,0
fmt7:   db 10,"This is matrixS:",10,0
fmt8:   db 10,"This is the inverse of matrixS:",10,0
fmt9:   db 10,"Proof: matrixS x inverse =",10,0
fmt10:  db 10,"This matrix is singular!",10,10,0

; Four 4x4 matrices of doubles, stored row by row (32 bytes per row).
; The 32-byte alignment lets every row be loaded with vmovapd into a ymm register.
        align   32
matrixA:
        dq       1.0,  3.0,  5.0,  7.0
        dq       9.0, 11.0, 13.0, 15.0
        dq      17.0, 19.0, 21.0, 23.0
        dq      25.0, 27.0, 29.0, 31.0
matrixB:
        dq       2.0,  4.0,  6.0,  8.0
        dq      10.0, 12.0, 14.0, 16.0
        dq      18.0, 20.0, 22.0, 24.0
        dq      26.0, 28.0, 30.0, 32.0
matrixC:
        dq       2.0, 11.0, 21.0, 37.0
        dq       3.0, 13.0, 23.0, 41.0
        dq       5.0, 17.0, 29.0, 43.0
        dq       7.0, 19.0, 31.0, 47.0
matrixS:
        dq       1.0,  2.0,  3.0,  4.0
        dq       5.0,  6.0,  7.0,  8.0
        dq       9.0, 10.0, 11.0, 12.0
        dq      13.0, 14.0, 15.0, 16.0

section .bss
; Result buffers, 16 doubles each, 32-byte aligned for vmovapd stores.
        alignb  32
product:        resq 16                 ; output of multi4x4
inverse:        resq 16                 ; output of inverse4x4

section .text

global main

;-----------------------------------------------------------------------
; main - demo driver: prints matrixA, matrixB and their product, then
;        inverts matrixC and matrixS and prints each inverse together
;        with the check "matrix x inverse".
; ABI:   SysV AMD64 (printf is called as a variadic C function).
; Out:   rax = 0
; Notes: printm4x4 takes the matrix address in rsi (not rdi).
;        inverse4x4 returns 1 in rax for a singular matrix; in that case
;        the error message is printed and the remaining steps are skipped.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp,rsp

; print title
        mov     rdi,fmt0
        xor     eax,eax                 ; variadic call: al = 0 vector-register arguments
        call    printf

; print matrixA
        mov     rdi,fmt1
        xor     eax,eax
        call    printf
        mov     rsi,matrixA
        call    printm4x4

; print matrixB
        mov     rdi,fmt2
        xor     eax,eax
        call    printf
        mov     rsi,matrixB
        call    printm4x4

; compute the product matrixA x matrixB
        mov     rdi,matrixA
        mov     rsi,matrixB
        mov     rdx,product
        call    multi4x4

; print the product
        mov     rdi,fmt3
        xor     eax,eax
        call    printf
        mov     rsi,product
        call    printm4x4

; print matrixC
        mov     rdi,fmt4
        xor     eax,eax
        call    printf
        mov     rsi,matrixC
        call    printm4x4

; compute the inverse of matrixC
        mov     rdi,matrixC
        mov     rsi,inverse
        call    inverse4x4
        cmp     rax,1                   ; 1 = singular
        je      singular

; print the inverse
        mov     rdi,fmt5
        xor     eax,eax
        call    printf
        mov     rsi,inverse
        call    printm4x4

; proof: multiply inverse and matrixC (should print the identity matrix)
        mov     rsi,matrixC
        mov     rdi,inverse
        mov     rdx,product
        call    multi4x4

; print the proof
        mov     rdi,fmt6
        xor     eax,eax
        call    printf
        mov     rsi,product
        call    printm4x4

; Singular matrix
; print matrixS
        mov     rdi,fmt7
        xor     eax,eax
        call    printf
        mov     rsi,matrixS
        call    printm4x4

; compute the inverse of matrixS
        mov     rdi,matrixS
        mov     rsi,inverse
        call    inverse4x4
        cmp     rax,1                   ; 1 = singular
        je      singular

; print the inverse
        mov     rdi,fmt8
        xor     eax,eax
        call    printf
        mov     rsi,inverse
        call    printm4x4

; proof: multiply inverse and matrixS
        mov     rsi,matrixS
        mov     rdi,inverse
        mov     rdx,product
        call    multi4x4

; print the proof
        mov     rdi,fmt9
        xor     eax,eax
        call    printf
        mov     rsi,product
        call    printm4x4
        jmp     exit

singular:
; print error
        mov     rdi,fmt10
        xor     eax,eax                 ; rax still holds 1 from inverse4x4 here
        call    printf

exit:
        xor     eax,eax                 ; exit status 0 instead of a leftover printf result
        leave
        ret

;-----------------------------------------------------------------------
; inverse4x4 - invert a 4x4 matrix of doubles with the Cayley-Hamilton
;              formula, coefficients p1..p4 from the Leverrier algorithm:
;                  inverse = -1/p4 * (A^3 + p1*A^2 + p2*A + p3*I)
; In:    rdi = address of the source matrix A      (read with vmovapd)
;        rsi = address of the destination matrix   (written with vmovapd)
; Out:   rax = 0 on success,
;              1 when dividing by p4 raised the MXCSR divide-by-zero flag
;                (the destination is not written in that case)
; Calls: multi4x4, vtrace
; Uses:  rcx, rdx, rsi, rdi, ymm0-ymm15, MXCSR divide-by-zero status flag
;-----------------------------------------------------------------------
inverse4x4:
section .data
        align   32
.identity       dq 1., 0., 0., 0.
                dq 0., 1., 0., 0.
                dq 0., 0., 1., 0.
                dq 0., 0., 0., 1.
; sign-bit mask for one double; the vxorpd instructions below read 16 bytes
; here, so the upper lane is XORed with .size - only the low lane is used
.minus_mask     dq 8000000000000000h
.size           dq 4                    ;4 x 4 matrices
.one            dq 1.0
.two            dq 2.0
.three          dq 3.0
.four           dq 4.0
.MXCSR_ZE       equ 4                   ;bit 2 of MXCSR: divide-by-zero flag

section .bss
        alignb  32
.matrix1        resq 16                 ;p2 * A
.matrix2        resq 16                 ;A^2, later p1 * A^2
.matrix3        resq 16                 ;A^3
.matrix4        resq 16                 ;A^4 (only its trace is needed)
.matrixI        resq 16                 ;p3 * I
.mxcsr          resd 1                  ;MXCSR image for the zero-division check

section .text
        push    rbp
        mov     rbp,rsp
        push    rsi                     ;save address of inverse matrix
        vzeroall                        ;clear all ymm registers

; compute the power matrices; rdi keeps the address of A throughout
; matrix2 = A x A
        mov     rsi,rdi
        mov     rdx,.matrix2
        push    rdi
        call    multi4x4
        pop     rdi
; matrix3 = A x matrix2
        mov     rsi,.matrix2
        mov     rdx,.matrix3
        push    rdi
        call    multi4x4
        pop     rdi
; matrix4 = A x matrix3
        mov     rsi,.matrix3
        mov     rdx,.matrix4
        push    rdi
        call    multi4x4
        pop     rdi

; compute the traces s1..s4 of A, A^2, A^3, A^4
        mov     rsi,[.size]
        call    vtrace
        movsd   xmm8,xmm0               ;s1 in xmm8

        push    rdi                     ;save address of the original matrix
        mov     rdi,.matrix2
        mov     rsi,[.size]
        call    vtrace
        movsd   xmm9,xmm0               ;s2 in xmm9

        mov     rdi,.matrix3
        mov     rsi,[.size]
        call    vtrace
        movsd   xmm10,xmm0              ;s3 in xmm10

        mov     rdi,.matrix4
        mov     rsi,[.size]
        call    vtrace
        movsd   xmm11,xmm0              ;s4 in xmm11

; compute the coefficients
; p1 = -s1
        vxorpd  xmm12,xmm8,[.minus_mask]        ;p1 in xmm12

; p2 = -1/2 * (p1 * s1 + s2)
        movsd   xmm13,xmm12                     ;copy p1 to xmm13
        vfmadd213sd xmm13,xmm8,xmm9             ;xmm13 = xmm13*xmm8 + xmm9
        vxorpd  xmm13,xmm13,[.minus_mask]
        divsd   xmm13,[.two]                    ;p2 in xmm13

; p3 = -1/3 * (p2 * s1 + p1 * s2 + s3)
        movsd   xmm14,xmm12                     ;copy p1 to xmm14
        vfmadd213sd xmm14,xmm9,xmm10            ;p1*s2 + s3
        vfmadd231sd xmm14,xmm13,xmm8            ;... + p2*s1
        vxorpd  xmm14,xmm14,[.minus_mask]
        divsd   xmm14,[.three]                  ;p3 in xmm14

; p4 = -1/4 * (p3 * s1 + p2 * s2 + p1 * s3 + s4)
        movsd   xmm15,xmm12                     ;copy p1 to xmm15
        vfmadd213sd xmm15,xmm10,xmm11           ;p1*s3 + s4
        vfmadd231sd xmm15,xmm13,xmm9            ;... + p2*s2
        vfmadd231sd xmm15,xmm14,xmm8            ;... + p3*s1
        vxorpd  xmm15,xmm15,[.minus_mask]
        divsd   xmm15,[.four]                   ;p4 in xmm15

; scale the matrices with their coefficient, one 32-byte row per pass
        mov     rcx,[.size]             ;rows left
        xor     rax,rax                 ;byte offset of the current row
        vbroadcastsd ymm1,xmm12         ;p1
        vbroadcastsd ymm2,xmm13         ;p2
        vbroadcastsd ymm3,xmm14         ;p3
        pop     rdi                     ;restore the address of the original matrix
.loop1:
        vmovapd ymm0,[rdi+rax]
        vmulpd  ymm0,ymm0,ymm2
        vmovapd [.matrix1+rax],ymm0     ;matrix1 = p2 * A
        vmovapd ymm0,[.matrix2+rax]
        vmulpd  ymm0,ymm0,ymm1
        vmovapd [.matrix2+rax],ymm0     ;matrix2 = p1 * A^2
        vmovapd ymm0,[.identity+rax]
        vmulpd  ymm0,ymm0,ymm3
        vmovapd [.matrixI+rax],ymm0     ;matrixI = p3 * I
        add     rax,32
        dec     rcx
        jnz     .loop1

; add the four matrices and multiply by -1/p4
        mov     rcx,[.size]
        xor     rax,rax

; The MXCSR exception flags are sticky: clear the divide-by-zero flag first,
; otherwise an earlier zero division (for instance a previous call with a
; singular matrix) would make this matrix look singular as well.
        stmxcsr [.mxcsr]
        and     dword [.mxcsr],~.MXCSR_ZE
        ldmxcsr [.mxcsr]

;compute 1/p4
        movsd   xmm0,[.one]
        vdivsd  xmm0,xmm0,xmm15         ;1/p4

;check for zero division
        stmxcsr [.mxcsr]
        and     dword [.mxcsr],.MXCSR_ZE
        jnz     .singular

; no zero division
        pop     rsi                     ;recall address of inverse matrix
        vxorpd  xmm0,xmm0,[.minus_mask] ;-1/p4
        vbroadcastsd ymm2,xmm0
;loop through the rows
.loop2:
        vmovapd ymm0,[.matrix1+rax]
        vaddpd  ymm0,ymm0,[.matrix2+rax]
        vaddpd  ymm0,ymm0,[.matrix3+rax]
        vaddpd  ymm0,ymm0,[.matrixI+rax]
        vmulpd  ymm0,ymm0,ymm2          ;multiply the row with -1/p4
        vmovapd [rsi+rax],ymm0
        add     rax,32
        dec     rcx
        jnz     .loop2

        xor     rax,rax                 ;return 0, no error
        leave
        ret

.singular:
        mov     rax,1                   ;return 1, singular matrix
        leave                           ;also discards the rsi saved at entry
        ret

;------------------------------------------------------

; trace computation

;-----------------------------------------------------------------------
; vtrace - sum of the main diagonal of a 4x4 matrix of doubles
; In:    rdi = address of the matrix (row 0 is read with vmovapd)
;        rsi = passed by the callers but not read here
; Out:   xmm0 = trace (the same value in both 64-bit lanes)
; Uses:  ymm0 only; leaf routine, no stack frame
;-----------------------------------------------------------------------
vtrace:
        ; gather the diagonal: lane i of ymm0 is taken from row i
        vmovapd  ymm0,[rdi]                     ; row 0, keeps lane 0
        vblendpd ymm0,ymm0,[rdi+32],0010b       ; lane 1 from row 1
        vblendpd ymm0,ymm0,[rdi+64],0100b       ; lane 2 from row 2
        vblendpd ymm0,ymm0,[rdi+96],1000b       ; lane 3 from row 3
        ; reduce the four lanes to one sum
        vhaddpd  ymm0,ymm0,ymm0                 ; d0+d1, d0+d1, d2+d3, d2+d3
        vpermpd  ymm0,ymm0,00100111b            ; low half becomes d2+d3, d0+d1
        haddpd   xmm0,xmm0                      ; (d2+d3) + (d0+d1)
        ret

;------------------------------------------------------

;-----------------------------------------------------------------------
; printm4x4 - print a 4x4 matrix of doubles, one tab-separated row per line
; In:    rsi = address of the matrix (note: rsi, not rdi)
; Calls: printf (variadic, four doubles in xmm0-xmm3, al = 4)
; Saves: rbx and r15 (callee-saved); rcx, rsi, rdi are kept across printf
;        by pushing them around every call
;-----------------------------------------------------------------------
printm4x4:
section .data
.row_fmt        db "%f",9,"%f",9, "%f",9,"%f",10,0
section .text
        push    rbp
        mov     rbp,rsp
        push    rbx                     ;callee saved
        push    r15                     ;callee saved
        mov     rdi,.row_fmt
        xor     ebx,ebx                 ;byte offset of the current row
        mov     ecx,4                   ;rows left to print
.next_row:
        movsd   xmm0,[rsi+rbx]
        movsd   xmm1,[rsi+rbx+8]
        movsd   xmm2,[rsi+rbx+16]
        movsd   xmm3,[rsi+rbx+24]
        push    rcx                     ;caller saved
        push    rsi                     ;caller saved
        push    rdi                     ;caller saved
        ; r15 = 8 when rsp is not on a 16-byte boundary, else 0; it is
        ; callee-saved, so it still holds the adjustment after printf
        xor     r15d,r15d
        test    rsp,0xf
        setnz   r15b
        lea     r15,[r15*8]
        sub     rsp,r15                 ;align the stack for the call
        mov     eax,4                   ;four floating-point arguments
        call    printf
        add     rsp,r15                 ;undo the alignment before popping
        pop     rdi
        pop     rsi
        pop     rcx
        add     rbx,32                  ;next row
        dec     rcx
        jnz     .next_row
        pop     r15
        pop     rbx
        leave
        ret

;------------------------------------------------------

;-----------------------------------------------------------------------
; multi4x4 - product of two 4x4 matrices of doubles: out = A x B
; In:    rdi = address of A   (elements are read with vbroadcastsd)
;        rsi = address of B   (rows are read with vmovapd)
;        rdx = address of out (rows are written with vmovapd)
; Out:   rax = 0
; Uses:  rcx, rsi (advanced by 128 bytes), all ymm registers (vzeroall)
; Method: ymm12-ymm15 accumulate rows 0-3 of the result. Pass k adds
;         A[i][k] * (row k of B) to the accumulator of row i.
;-----------------------------------------------------------------------
multi4x4:
        push    rbp
        mov     rbp,rsp
        xor     eax,eax                 ;rax = byte offset of column k inside a row of A
        mov     ecx,4                   ;passes left, one per row of B
        vzeroall                        ;zero all ymm, including the accumulators
.loop:
        vmovapd ymm0,[rsi]              ;row k of B
        vbroadcastsd ymm1,[rdi+rax]     ;A[0][k]
        vfmadd231pd ymm12,ymm1,ymm0
        vbroadcastsd ymm1,[rdi+32+rax]  ;A[1][k]
        vfmadd231pd ymm13,ymm1,ymm0
        vbroadcastsd ymm1,[rdi+64+rax]  ;A[2][k]
        vfmadd231pd ymm14,ymm1,ymm0
        vbroadcastsd ymm1,[rdi+96+rax]  ;A[3][k]
        vfmadd231pd ymm15,ymm1,ymm0
        add     rax,8                   ;one element has 8 bytes, 64 bits
        add     rsi,32                  ;every row has 32 bytes, 256 bits
        dec     ecx
        jnz     .loop
;move the result to memory, row per row
        vmovapd [rdx],ymm12
        vmovapd [rdx+32],ymm13
        vmovapd [rdx+64],ymm14
        vmovapd [rdx+96],ymm15
        vzeroupper                      ;clean upper halves before callers run SSE code
        xor     eax,eax                 ;return value
        leave
        ret

Listing 36-1

matrix4x4.asm

The interesting parts of this code are in the functions. The main function is for initializing the program, calling functions, and printing. The matrices we use in this example are 4×4 double-precision floating-point matrices. Note the 32-byte alignment of the matrices; in AVX we use ymm registers, with a size of 32 bytes. We will analyze the program function by function.

Matrix Print: printm4x4

We read the matrix one row at a time into four xmm registers, and then we push a number of registers onto the stack. These registers will be modified by printf, so we have to preserve them. Then we align the stack on a 16-byte boundary. Because of normal operation, rsp will be aligned on an 8-byte boundary. To align the stack on a 16-byte boundary, we cannot use the trick with the and instruction from Chapter 16. This is because with the and instruction, we do not know whether rsp will be changed or not. And we need the correct stack pointer because we pop the pushed registers after printf. If rsp was changed, we need to return it to its previous value before popping; otherwise, the wrong values will be popped from the stack. If rsp was not changed, we do not need to adjust it.

We will use the test instruction and 0xf to verify the alignment of the stack. If the last hexadecimal digit of rsp is a 0, then rsp is 16-byte aligned. If the last digit contains anything other than 0, then the last half-byte will have at least one of its bits set to 1. The test instruction is similar to an and instruction. If the last half-byte of rsp has one or more bits set to 1, the result of the comparison will be nonzero, and the zero-flag ZF will be cleared. The setnz (set-if-non-zero) instruction reads the zero flag (ZF), and if the ZF is not set, setnz will put 0000 0001 into r15b. If that happens, it means that rsp is not 16-byte aligned, and we will subtract 8 to put it on a 16-byte boundary. We left-shift r15b three times to obtain the decimal value 8 and do the subtraction. After the execution of printf, we restore the correct stack address by adding r15 back to rsp, that is, adding 8 if we had to align or adding 0 if we did not have to align. The stack is then where it was before our alignment, and we can pop the registers.

Matrix Multiplication: multi4x4

In the sample code and in the following explanation, we use the following two matrices:

$A=\left[\begin{array}{cccc}1& 3& 5& 7\\ 9& 11& 13& 15\\ 17& 19& 21& 23\\ 25& 27& 29& 31\end{array}\right]$ $B=\left[\begin{array}{cccc}2& 4& 6& 8\\ 10& 12& 14& 16\\ 18& 20& 22& 24\\ 26& 28& 30& 32\end{array}\right]$

If you studied some linear algebra, you probably learned to multiply matrices as follows: to obtain element c₁₁ of matrix C = AB, you compute the following:

${a}_{11}{b}_{11}+{a}_{12}{b}_{21}+{a}_{13}{b}_{31}+{a}_{14}{b}_{41}$

With our example, it looks like this:

1x2 + 3x10 + 5x18 + 7x26 = 304

As another example, element c₃₂ would be computed as follows:

${a}_{31}{b}_{12}+{a}_{32}{b}_{22}+{a}_{33}{b}_{32}+{a}_{34}{b}_{42}$

With our example, it looks like this:

17x4 + 19x12 + 21x20 + 23x28 = 1360

This is efficient for manual computation; however, we are going to use a method that is more appropriate for a computer. We will use the ymm registers for keeping running totals and for updating the totals in subsequent loops. Here we make use of the power of AVX instructions.

First, we clear all the ymm registers with vzeroall. Then we go into a loop four times, once for every row in matrixB. A row of four double-precision values from matrixB is loaded in ymm0. Then a value from a sequentially selected column of matrixA is broadcasted into ymm1. The register rax serves as a column counter, and the column values are at offset 0, 32, 64, and 96. Broadcasting means that all four quadwords (8 bytes each) will contain that value. Then the values in ymm1 are multiplied with the values in ymm0 and added to ymm12. The multiplying and adding are done with one instruction called vfmadd231pd, which means “vector fused multiply add packed double.” The 231 indicates how the registers are used. There are multiple variants of vfmadd (132, 213, 231), and there are variants for double precision and single precision. We used 231, which means multiply the second operand with the third operand, add to the first operand, and put the result in the first operand. This is done for every column value of the matrixA column, and then the iteration continues; the next row of matrixB is loaded, and the computation restarts.

Walk through the program with your favorite debugger. Look at how the registers ymm12, ymm13, ymm14, and ymm15 keep the running totals, and finally give the product. Your debugger probably will give the values in the ymm registers in hexadecimal and little-endian format. To make it easy, here are the details of what is happening at every step:

rdi					rsi
32 bytes					32 bytes
	8 bytes	8 bytes	8 bytes	8 bytes		8 bytes	8 bytes	8 bytes	8 bytes
0–31	1	3	5	7	0–31	2	4	6	8
32–63	9	11	13	15	32–63	10	12	14	16
64–95	17	19	21	23	64–95	18	20	22	24
96–127	25	27	29	31	96–127	26	28	30	32

Here is the first loop:

vmovapd ymm0, [rsi]	ymm0	2	4	6	8
vbroadcastsd ymm1,[rdi+0]	ymm1	1	1	1	1
vfmadd231pd ymm12,ymm1,ymm0	ymm12	2	4	6	8
vbroadcastsd ymm1,[rdi+32+0]	ymm1	9	9	9	9
vfmadd231pd ymm13,ymm1,ymm0	ymm13	18	36	54	72
vbroadcastsd ymm1,[rdi+64+0]	ymm1	17	17	17	17
vfmadd231pd ymm14,ymm1,ymm0	ymm14	34	68	102	136
vbroadcastsd ymm1,[rdi+96+0]	ymm1	25	25	25	25
vfmadd231pd ymm15,ymm1,ymm0	ymm15	50	100	150	200

Here is the second loop:

vmovapd ymm0, [rsi+32]	ymm0	10	12	14	16
vbroadcastsd ymm1,[rdi+8]	ymm1	3	3	3	3
vfmadd231pd ymm12,ymm1,ymm0	ymm12	32	40	48	56
vbroadcastsd ymm1,[rdi+32+8]	ymm1	11	11	11	11
vfmadd231pd ymm13,ymm1,ymm0	ymm13	128	168	208	248
vbroadcastsd ymm1,[rdi+64+8]	ymm1	19	19	19	19
vfmadd231pd ymm14,ymm1,ymm0	ymm14	224	296	368	440
vbroadcastsd ymm1,[rdi+96+8]	ymm1	27	27	27	27
vfmadd231pd ymm15,ymm1,ymm0	ymm15	320	424	528	632

Here is the third loop:

vmovapd ymm0, [rsi+32+32]	ymm0	18	20	22	24
vbroadcastsd ymm1,[rdi+8+8]	ymm1	5	5	5	5
vfmadd231pd ymm12,ymm1,ymm0	ymm12	122	140	158	176
vbroadcastsd ymm1,[rdi+32+8+8]	ymm1	13	13	13	13
vfmadd231pd ymm13,ymm1,ymm0	ymm13	362	428	494	560
vbroadcastsd ymm1,[rdi+64+8+8]	ymm1	21	21	21	21
vfmadd231pd ymm14,ymm1,ymm0	ymm14	602	716	830	944
vbroadcastsd ymm1,[rdi+96+8+8]	ymm1	29	29	29	29
vfmadd231pd ymm15,ymm1,ymm0	ymm15	842	1004	1166	1328

Here is the fourth and last loop:

vmovapd ymm0, [rsi+32+32+32]	ymm0	26	28	30	32
vbroadcastsd ymm1,[rdi+8+8+8]	ymm1	7	7	7	7
vfmadd231pd ymm12,ymm1,ymm0	ymm12	304	336	368	400
vbroadcastsd ymm1,[rdi+32+8+8+8]	ymm1	15	15	15	15
vfmadd231pd ymm13,ymm1,ymm0	ymm13	752	848	944	1040
vbroadcastsd ymm1,[rdi+64+8+8+8]	ymm1	23	23	23	23
vfmadd231pd ymm14,ymm1,ymm0	ymm14	1200	1360	1520	1680
vbroadcastsd ymm1,[rdi+96+8+8+8]	ymm1	31	31	31	31
vfmadd231pd ymm15,ymm1,ymm0	ymm15	1648	1872	2096	2320

Matrix Inversion: Inverse4x4

Mathematicians have developed a range of algorithms to efficiently compute the inverse of a matrix. It is not our intent to provide you with an inversion program with all the bells and whistles; we just want to show how to use AVX.

We will use a method based on the Cayley-Hamilton theorem about characteristic polynomials. Here is an interesting site with more information on characteristic polynomials: http://www.mcs.csueastbay.edu/~malek/Class/Characteristic.pdf .

Cayley-Hamilton Theorem

From the Cayley-Hamilton theorem, we have the following for matrix A:

${A}^n+{p}_1{A}^{n-1}+\cdots +{p}_{n-1}A+{p}_nI=0$

where Aⁿ is A to the power of n. For example, A³ is AAA, the matrix A three times multiplied with itself. The p’s are coefficients to be determined, I is the identity matrix, and 0 is the zero matrix.

Multiply the previous equation by A⁻¹, divide by -p_n, rearrange the terms, and you obtain a formula for the inverse, as shown here:

$\frac{1}{-{p}_n}\left[{A}^{n-1}+{p}_1{A}^{n-2}+\cdots +{p}_{n-2}A+{p}_{n-1}I\right]={A}^{-1}$

So, to find the inverse of matrix A, we need to do a number of matrix multiplications, and we need a method to find the p’s.

For a 4×4 matrix A, we have the following:

$\frac{1}{-{p}_4}\left[{A}^3+{p}_1{A}^2+{p}_2A+{p}_3I\right]={A}^{-1}$

Leverrier Algorithm

To compute the p coefficients, we use the Leverrier algorithm , also covered at http://www.mcs.csueastbay.edu/~malek/Class/Characteristic.pdf . First, we find the traces of the matrices, that is, the sum of the elements on the diagonal from the upper left to the lower right. Let’s call s_n the trace of the matrix Aⁿ.

For a 4×4 matrix A, we compute the traces of the power matrices of A, as shown here:

s₁ for A
s₂ for AA
s₃ for AAA
s₄ for AAAA

Leverrier gives us the following then:

${p}_1=-{s}_1$

${p}_2=-\frac{1}{2}\left({p}_1{s}_1+{s}_2\right)$

${p}_3=-\frac{1}{3}\left({p}_2{s}_1+{p}_1{s}_2+{s}_3\right)$

${p}_4=-\frac{1}{4}\left({p}_3{s}_1+{p}_2{s}_2+{p}_1{s}_3+{s}_4\right)$

Pretty simple, right? Apart from some elaborate matrix multiplications to obtain the traces, of course.

The Code

In our function inverse4x4, we have a separate section .data, where we put our identity matrix and some variables we will use later. First, we compute the power matrices and store them in matrix2, matrix3, and matrix4. We will not use matrix1 yet. Then we call the function vtrace for every matrix to compute the traces. In the vtrace function , we first build our matrix in the ymm registers (ymm0, ymm1, ymm2, ymm3), each containing a row. Then we use the instruction vblendpd, which has four operands: two source operands, one destination operand, and a control mask. We want to extract the diagonal elements in rows 2, 3, and 4 and put them as packed values in ymm0, at locations index 1, 2, and 3. At location 0, we keep the trace element of ymm0.

The mask determines which packed values are selected from the source operands. A 1 in the mask means at this location, select the value from the second source operand. A 0 in the mask means at this location, select the value from the first source operand. See Figure 36-1 for a schematic overview, but note that in the figure we display the values in the registers in such a way that they correspond with the bit mask indexes. In your debugger, you will see that the positions in ymm0 are a1, a0, a3, a2.

../images/483996_1_En_36_Chapter/483996_1_En_36_Fig1_HTML.png — Figure 36-1
Blend mask

In the first trace computation, after the blending, the ymm0 register contains the trace elements 2, 13, 29, 47. You can check this with SASM. Don’t be fooled by the order of the values of ymm0 as represented: 13, 2, 47, 29. We now have to sum these values. This can easily be done by extracting and simply adding, but for the sake of the demo, we will use AVX instructions. We apply the horizontal add instruction vhaddpd. ymm0 then contains 15, 15, 76, 76, which are the sum of the two lower values and the sum of the two higher values. Then we execute a permutation vpermpd with mask 00100111. Each two-bit value selects a value in the source operand; see Figure 36-2 for an explanation. Now the lower half of ymm0, which is xmm0, contains two values, so we have to add these to obtain the trace. We execute a horizontal add on xmm0 with haddpd. We store the traces in xmm8, xmm9, xmm10, and xmm11 for later use.

It’s a bit overkill to obtain the trace, don’t you think? We did it this way just to show a couple of AVX instructions and how to use masks.

../images/483996_1_En_36_Chapter/483996_1_En_36_Fig2_HTML.png — Figure 36-2
Permutation mask

When we have all the traces, we can compute the p-coefficients. See how we change the sign of a value by applying a minus mask and the instruction vxorpd. We use the vfmadd213sd and vfmadd231sd to do additions and multiplications in one instruction. The instruction vfmadd213sd means multiply the first and second operands, add a third operand, and put the result in the first operand. The instruction vfmadd231sd means multiply the second and third operands, add the first operand, and put the result in the first operand. There is a list of similar instructions in the Intel manual. Study them carefully.

When we have all the coefficients, we scalar-multiply matrix, matrix2, and matrixI with the coefficients, according to the previous formulae; matrix3 has coefficient 1 and is used unchanged. The result of multiplication with matrix is put into matrix1. We do not need matrix4 anymore, so to save memory, we could have used the space for inverse as temporary memory instead of matrix4.

We have to divide by coefficient p₄, so we have to check that p₄ is nonzero. In this case, we could have done this simple operation after computing p₄ earlier, but we wanted to show how to use the mxcsr register. We do the division with the instruction vdivsd and then store mxcsr to memory with stmxcsr; the zero-division exception is masked by default, so the division only raises a status flag. If after division the third bit (index 2) in the mxcsr register is set, then we had a zero division, and the matrix is singular and cannot be inverted. In the and instruction, we used decimal 4, which is 0000 0100 in binary, so we are checking the third bit indeed. If we had a zero division, we head for the exit with 1 in rax to signal the error to the caller.

When a matrix is singular, the program will not crash because zero division is masked by default in the mxcsr register. After you finish the analysis of this code, comment out the part that checks for zero division and see what happens.

If p₄ is nonzero, we add the four matrices and scalar-multiply the result with -1/p₄. We do the addition and multiplication in the same loop. When everything goes fine, we have the inverse, and we return to the caller with 0 in rax.

Figure 36-3 shows the output.

../images/483996_1_En_36_Chapter/483996_1_En_36_Fig3_HTML.jpg — Figure 36-3
matrix4x4.asm output

Summary

In this chapter, you learned about the following:

AVX matrix operations
AVX instruction with three operands
AVX fuse operations
Use of masks for blending and permutations

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.

Table of Contents for 36. AVX Matrix Operations

Create new playlist

Sign In

Sign Up

36. AVX Matrix Operations

Example Matrix Code

Matrix Print: printm4x4

Matrix Multiplication: multi4x4

Matrix Inversion: Inverse4x4

Caley-Hamilton Theorem

Leverrier Algorithm

The Code

Summary

Table of Contents for
36. AVX Matrix Operations