Advanced Vector Extensions (AVX) is an extension of SSE. Whereas SSE provides 16 xmm registers, each 128 bits wide, AVX offers 16 ymm registers, each 256 bits wide. The lower half of each ymm register is in fact the corresponding xmm register; in other words, each xmm register is an alias for the low 128 bits of the ymm register with the same number. AVX-512 is a further extension offering 32 zmm registers, each 512 bits wide.
In addition to these registers, AVX extends the SSE instructions and provides a whole range of additional new instructions. After you work your way through the SSE chapters in this book, you will not find it too difficult to navigate the large number of SSE and AVX instructions.
In this chapter, we will first show how to determine which AVX version the processor supports, and then we will show an example program.
Test for AVX Support
Listing 35-1 shows a program to find out whether your CPU supports AVX.
; cpu_avx.asm
; NASM (Intel syntax), x86-64; printf is resolved from the C library at link time.
extern printf

section .data
    ; One "not supported" / "supported" message per feature level.
    ; Every string ends with a newline (10) and the terminating 0.
    fmt_noavx:      db  "This cpu does not support AVX.",10,0
    fmt_avx:        db  "This cpu supports AVX.",10,0
    fmt_noavx2:     db  "This cpu does not support AVX2.",10,0
    fmt_avx2:       db  "This cpu supports AVX2.",10,0
    fmt_noavx512:   db  "This cpu does not support AVX-512.",10,0
    fmt_avx512:     db  "This cpu supports AVX-512.",10,0

section .bss
    ; (no uninitialized data)

section .text
    global main
main:
    ; Entry point: run the feature probe and return to main's caller with
    ; rax exactly as cpu_sse left it.
    push    rbp
    mov     rbp, rsp                ; after this push rsp is 16-byte aligned for the call
    call    cpu_sse                 ; prints the detected AVX feature level(s)
    mov     rsp, rbp                ; two-instruction spelling of leave
    pop     rbp
    ret
;-----------------------------------------------------------------------
; cpu_sse - report AVX, AVX2 and AVX-512 Foundation support.
; ABI:   System V AMD64
; In:    nothing
; Out:   rax = 1 if the cpuid AVX flag is set, otherwise 0
;        One line per tested feature is written with printf; testing
;        stops at the first feature that is missing.
; Clobb: caller-saved registers (cpuid writes eax/ebx/ecx/edx, printf
;        may change any volatile register). rbx and r12 are preserved.
; Note:  only the cpuid feature bits are examined. Whether the operating
;        system has enabled the ymm/zmm register state (OSXSAVE + xgetbv)
;        is not checked here.
;-----------------------------------------------------------------------
cpu_sse:
    push    rbp
    mov     rbp, rsp
    push    rbx                     ; cpuid overwrites rbx, which is callee-saved
    push    r12                     ; r12 holds the result; two pushes keep rsp 16-byte aligned
    xor     r12d, r12d              ; result = 0 until AVX is confirmed
;test for avx
    mov     eax, 1                  ; leaf 1: processor feature flags
    cpuid
    bt      ecx, 28                 ; ecx bit 28 = AVX
    jnc     .no_avx
    mov     r12d, 1                 ; AVX present: this is the return value
    lea     rdi, [rel fmt_avx]
    xor     eax, eax                ; variadic call, no vector arguments
    call    printf
;test for avx2
    mov     eax, 7                  ; leaf 7, subleaf 0: extended feature flags
    xor     ecx, ecx
    cpuid
    bt      ebx, 5                  ; ebx bit 5 = AVX2
    jnc     .no_avx2
    lea     rdi, [rel fmt_avx2]
    xor     eax, eax
    call    printf
;test for avx512 foundation
    ; ebx still holds the leaf 7 flags: rbx is callee-saved, so the printf
    ; call above left it intact and cpuid need not be executed again.
    bt      ebx, 16                 ; ebx bit 16 = AVX-512F
    jnc     .no_avx512
    lea     rdi, [rel fmt_avx512]
    xor     eax, eax
    call    printf
    jmp     .exit
.no_avx:
    lea     rdi, [rel fmt_noavx]
    jmp     .report
.no_avx2:
    lea     rdi, [rel fmt_noavx2]
    jmp     .report
.no_avx512:
    lea     rdi, [rel fmt_noavx512]
.report:                            ; print the "not supported" message in rdi
    xor     eax, eax
    call    printf
.exit:
    mov     eax, r12d               ; 1 = AVX supported, 0 = not supported
    pop     r12
    pop     rbx
    leave
    ret
Listing 35-1
cpu_avx.asm
This program is similar to the program we used to test for SSE support, but we have to look for AVX flags now. So, there is nothing special here; you can find more details of which registers to use and what information can be retrieved in the Intel manual, Volume 2, in the section on cpuid.
Listing 35-2 is adapted from the SSE unaligned example in Chapter 28.
; avx_unaligned.asm
; NASM (Intel syntax), x86-64; printf is resolved from the C library at link time.
extern printf

section .data
    ; Two vectors of eight single-precision values (8 x 32 bits = one ymm register).
    ; No alignment is requested; the code uses the unaligned move instructions.
    spvector1   dd  1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1
    spvector2   dd  1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2

    ; Two vectors of four double-precision values (4 x 64 bits = one ymm register).
    dpvector1   dq  1.1, 2.2, 3.3, 4.4
    dpvector2   dq  5.5, 6.6, 7.7, 8.8

    ; Headings printed before each vector.
    fmt1    db  "Single Precision Vector 1:",10,0
    fmt2    db  10,"Single Precision Vector 2:",10,0
    fmt3    db  10,"Sum of Single Precision Vector 1 and Vector 2:",10,0
    fmt4    db  10,"Double Precision Vector 1:",10,0
    fmt5    db  10,"Double Precision Vector 2:",10,0
    fmt6    db  10,"Sum of Double Precision Vector 1 and Vector 2:",10,0

section .bss
    spvector_res    resd    8       ; sum of the single-precision vectors
    dpvector_res    resq    4       ; sum of the double-precision vectors

section .text
    global main
;-----------------------------------------------------------------------
; main - add two single-precision and two double-precision vectors with
;        256-bit AVX instructions and print operands and sums.
; ABI:   System V AMD64
; Out:   eax = 0
; Stack: one push (rbp) leaves rsp 16-byte aligned at every call.
;-----------------------------------------------------------------------
main:
    push    rbp
    mov     rbp, rsp
;SINGLE PRECISION FLOATING POINT VECTORS
;load vector1 in the register ymm0
    vmovups ymm0, [rel spvector1]
;extract ymm0 (demonstration only: xmm2 is overwritten again below)
    vextractf128 xmm2, ymm0, 0      ; first part of ymm0
    vextractf128 xmm2, ymm0, 1      ; second part of ymm0
;load vector2 in the register ymm1
    vmovups ymm1, [rel spvector2]
;extract ymm1
    vextractf128 xmm2, ymm1, 0
    vextractf128 xmm2, ymm1, 1
;add 2 single precision floating point vectors
    vaddps  ymm2, ymm0, ymm1
    vmovups [rel spvector_res], ymm2
    vzeroupper                      ; clear upper ymm halves before calling SSE/libc code
;print the vectors
    lea     rdi, [rel fmt1]
    xor     eax, eax                ; variadic call, no vector arguments
    call    printf
    lea     rsi, [rel spvector1]
    call    printspfpv
    lea     rdi, [rel fmt2]
    xor     eax, eax
    call    printf
    lea     rsi, [rel spvector2]
    call    printspfpv
    lea     rdi, [rel fmt3]
    xor     eax, eax
    call    printf
    lea     rsi, [rel spvector_res]
    call    printspfpv
;DOUBLE PRECISION FLOATING POINT VECTORS
;load vector1 in the register ymm0
    vmovupd ymm0, [rel dpvector1]
;extract ymm0 (demonstration only)
    vextractf128 xmm2, ymm0, 0      ; first part of ymm0
    vextractf128 xmm2, ymm0, 1      ; second part of ymm0
;load vector2 in the register ymm1
    vmovupd ymm1, [rel dpvector2]
;extract ymm1
    vextractf128 xmm2, ymm1, 0
    vextractf128 xmm2, ymm1, 1
;add 2 double precision floating point vectors
    vaddpd  ymm2, ymm0, ymm1
    vmovupd [rel dpvector_res], ymm2
    vzeroupper
;print the vectors
    lea     rdi, [rel fmt4]
    xor     eax, eax
    call    printf
    lea     rsi, [rel dpvector1]
    call    printdpfpv
    lea     rdi, [rel fmt5]
    xor     eax, eax
    call    printf
    lea     rsi, [rel dpvector2]
    call    printdpfpv
    lea     rdi, [rel fmt6]
    xor     eax, eax
    call    printf
    lea     rsi, [rel dpvector_res]
    call    printdpfpv
    xor     eax, eax                ; exit status 0
    leave
    ret
;-----------------------------------------------------------------------
; printspfpv - print eight single-precision floats as "%.1f, " each,
;              followed by a newline.
; ABI:   System V AMD64 for the printf calls
; In:    rsi = address of the first of 8 consecutive 32-bit floats
; Out:   nothing defined
; Clobb: caller-saved registers (printf is called); rbx and r12 are
;        saved and restored.
;-----------------------------------------------------------------------
printspfpv:
section .data
    .NL     db  10,0
    .fmt1   db  "%.1f, ",0
section .text
    push    rbp
    mov     rbp, rsp
    push    rbx                     ; rbx = element index
    push    r12                     ; r12 = vector base; two pushes keep rsp 16-byte aligned
    mov     r12, rsi
    xor     ebx, ebx
.loop:
    movss   xmm0, [r12 + rbx*4]     ; load one float
    cvtss2sd xmm0, xmm0             ; printf's %f expects a double
    lea     rdi, [rel .fmt1]
    mov     eax, 1                  ; one vector argument; reset each pass because
                                    ; printf returns its character count in rax
    call    printf
    inc     rbx
    cmp     rbx, 8
    jb      .loop
    lea     rdi, [rel .NL]
    xor     eax, eax                ; no vector arguments
    call    printf
    pop     r12
    pop     rbx
    leave
    ret
;-----------------------------------------------------------------------
; printdpfpv - print four double-precision floats on one line as
;              "%.1f, %.1f, %.1f, %.1f", followed by a newline.
; ABI:   System V AMD64 for the printf calls
; In:    rsi = address of the first of 4 consecutive 64-bit doubles
; Out:   nothing defined
; Clobb: caller-saved registers (printf is called)
;-----------------------------------------------------------------------
printdpfpv:
section .data
    .NL     db  10,0
    .fmt    db  "%.1f, %.1f, %.1f, %.1f",0
section .text
    push    rbp
    mov     rbp, rsp
    movsd   xmm0, [rsi]             ; the four values go in xmm0..xmm3,
    movsd   xmm1, [rsi + 8]         ; the first four floating-point
    movsd   xmm2, [rsi + 16]        ; argument registers
    movsd   xmm3, [rsi + 24]
    lea     rdi, [rel .fmt]
    mov     eax, 4                  ; four vector arguments
    call    printf
    lea     rdi, [rel .NL]
    xor     eax, eax                ; no vector arguments
    call    printf
    leave
    ret
Listing 35-2
avx_unaligned.asm
In this program, we use the 256-bit ymm registers and some new instructions. For example, we use vmovups to put unaligned data in a ymm register. We use SASM to view the registers. After the vmovups instructions, ymm0 contains the following:
Look carefully at where the individual values are stored; the ordering can be confusing at first.
Just for the sake of the demo, we extract data from a ymm register, and we use vextractf128 to put packed floating-point values from ymm0 to xmm2, 128 bits at a time. You could use extractps to further extract floating-point values and store them in general-purpose registers.
New are instructions with three operands, as shown here:
vaddps ymm2,ymm0,ymm1
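This adds the packed single-precision values in ymm1 to those in ymm0 and stores the result in ymm2. Unlike the two-operand SSE form, neither source register is overwritten.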
The print functions simply load the values from memory into an xmm register, convert single precision to double precision where needed, and then call printf.