© Jo Van Hoey 2019
J. Van Hoey, Beginning x64 Assembly Programming, https://doi.org/10.1007/978-1-4842-5076-1_37

37. Matrix Transpose

Jo Van Hoey1 
(1)
Hamme, Belgium
 

Let’s do one last matrix operation that is useful: transposing. We have coded two versions, one using unpacking and one using shuffling.

Example Transposing Code

Listing 37-1 shows the code.
; transpose4x4.asm
; Transposes a 4x4 matrix of doubles with AVX, once with unpack and once
; with shuffle instructions, and prints the matrix and both results.
; Syntax: NASM (Intel operand order); printf is called with its format in
; rdi, doubles in xmm0-xmm3 and the vector-register count in al.
extern printf
section .data
      fmt0   db      "4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE",10,0
      fmt1   db      10,"This is the matrix:",10,0
      fmt2   db      10,"This is the transpose (unpack):",10,0
      fmt3   db      10,"This is the transpose (shuffle):",10,0
      align  32                  ; matrix is read with vmovapd ymm (32-byte aligned loads)
      matrix dq       1.,     2.,     3.,     4.
             dq       5.,     6.,     7.,     8.
             dq       9.,    10.,    11.,    12.
             dq      13.,    14.,    15.,    16.
section .bss
      alignb       32            ; transpose is written with vmovapd ymm (32-byte aligned stores)
      transpose    resq   16     ; 16 doubles = 128 bytes (resd would reserve only 64)
section .text
      global main
; main - prints the title and the source matrix, then the transpose of
; `matrix` as computed by transpose_unpack_4x4 and by transpose_shuffle_4x4.
; Returns 0 in eax. rax is cleared before every printf call because printf
; is variadic and no vector-register arguments are passed here.
main:
push  rbp
mov   rbp,rsp
; print title
      mov    rdi, fmt0
      xor    rax,rax
      call   printf
; print matrix
      mov    rdi,fmt1
      xor    rax,rax
      call   printf
      mov    rsi,matrix          ; printm4x4 takes the matrix address in rsi
      call   printm4x4
; compute transpose unpack
      mov    rdi, matrix         ; source
      mov    rsi, transpose      ; destination
      call   transpose_unpack_4x4
;print the result
      mov    rdi, fmt2
      xor    rax,rax
      call   printf
      mov    rsi, transpose
      call   printm4x4
; compute transpose shuffle
      mov    rdi, matrix         ; source
      mov    rsi, transpose      ; destination (overwrites the unpack result)
      call   transpose_shuffle_4x4
;print the result
      mov    rdi, fmt3
      xor    rax,rax
      call   printf
      mov    rsi, transpose
      call   printm4x4
      xor    eax,eax             ; exit status 0 instead of printf's leftover return value
leave
ret
;--------------------------------------------------------
transpose_unpack_4x4:
; Transposes a 4x4 matrix of doubles with vunpcklpd/vunpckhpd followed by
; 128-bit lane permutes.
; In:    rdi = source matrix, 16 contiguous doubles (four rows of four),
;              32-byte aligned (read with vmovapd)
;        rsi = destination, 16 doubles, 32-byte aligned (written with vmovapd)
;              All rows are loaded before the first store, so rsi may equal rdi.
; Clobb: ymm0-ymm3, ymm12-ymm15 (upper halves of all ymm registers are
;        zeroed by vzeroupper on exit)
; Value lists in the comments are in memory order, lowest qword first.
push  rbp
mov   rbp,rsp
;load matrix into the registers
      vmovapd       ymm0,[rdi]    ;  1   2   3   4
      vmovapd       ymm1,[rdi+32] ;  5   6   7   8
      vmovapd       ymm2,[rdi+64] ;  9  10  11  12
      vmovapd       ymm3,[rdi+96] ; 13  14  15  16
;unpack: interleave the low (or high) qword of each 128-bit lane of two rows
      vunpcklpd ymm12,ymm0,ymm1         ;  1   5   3   7
      vunpckhpd ymm13,ymm0,ymm1         ;  2   6   4   8
      vunpcklpd ymm14,ymm2,ymm3         ;  9  13  11  15
      vunpckhpd ymm15,ymm2,ymm3         ; 10  14  12  16
;permutate: 0x20 = low lanes of both sources, 0x31 = high lanes of both sources
      vperm2f128 ymm0,ymm12,ymm14, 00100000b    ; 1   5   9  13
      vperm2f128 ymm1,ymm13,ymm15, 00100000b    ; 2   6  10  14
      vperm2f128 ymm2,ymm12,ymm14, 00110001b    ; 3   7  11  15
      vperm2f128 ymm3,ymm13,ymm15, 00110001b    ; 4   8  12  16
;write to memory
      vmovapd       [rsi],   ymm0
      vmovapd       [rsi+32],ymm1
      vmovapd       [rsi+64],ymm2
      vmovapd       [rsi+96],ymm3
;the caller continues with legacy SSE code (movsd), so clear the upper
;ymm halves to avoid AVX/SSE transition penalties
      vzeroupper
leave
ret
;--------------------------------------------------------
transpose_shuffle_4x4:
; Transposes a 4x4 matrix of doubles with vshufpd followed by 128-bit lane
; permutes.
; In:    rdi = source matrix, 16 contiguous doubles (four rows of four),
;              32-byte aligned (read with vmovapd)
;        rsi = destination, 16 doubles, 32-byte aligned (written with vmovapd)
;              All rows are loaded before the first store, so rsi may equal rdi.
; Clobb: ymm0-ymm3, ymm12-ymm15 (upper halves of all ymm registers are
;        zeroed by vzeroupper on exit)
; Value lists in the comments are in memory order, lowest qword first.
push  rbp
mov   rbp,rsp
;load matrix into the registers
      vmovapd       ymm0,[rdi]    ;  1   2   3   4
      vmovapd       ymm1,[rdi+32] ;  5   6   7   8
      vmovapd       ymm2,[rdi+64] ;  9  10  11  12
      vmovapd       ymm3,[rdi+96] ; 13  14  15  16
;shuffle: mask 0000b picks the low qword of every lane, 1111b the high qword;
;even result positions come from source 1, odd positions from source 2
      vshufpd       ymm12,ymm0,ymm1, 0000b    ;  1   5   3   7
      vshufpd       ymm13,ymm0,ymm1, 1111b    ;  2   6   4   8
      vshufpd       ymm14,ymm2,ymm3, 0000b    ;  9  13  11  15
      vshufpd       ymm15,ymm2,ymm3, 1111b    ; 10  14  12  16
;permutate: 0x20 = low lanes of both sources, 0x31 = high lanes of both sources
      vperm2f128 ymm0,ymm12,ymm14, 00100000b    ; 1   5   9  13
      vperm2f128 ymm1,ymm13,ymm15, 00100000b    ; 2   6  10  14
      vperm2f128 ymm2,ymm12,ymm14, 00110001b    ; 3   7  11  15
      vperm2f128 ymm3,ymm13,ymm15, 00110001b    ; 4   8  12  16
;write to memory
      vmovapd       [rsi],   ymm0
      vmovapd       [rsi+32],ymm1
      vmovapd       [rsi+64],ymm2
      vmovapd       [rsi+96],ymm3
;the caller continues with legacy SSE code (movsd), so clear the upper
;ymm halves to avoid AVX/SSE transition penalties
      vzeroupper
leave
ret
;--------------------------------------------------------
printm4x4:
; Prints a 4x4 matrix of doubles with printf: one row per line, the four
; values separated by tabs and formatted with "%.f".
; In:    rsi = address of the matrix (16 contiguous doubles)
; rbx and r15 are saved on entry and restored on exit.
section .data
      .fmt   db     "%.f",9,"%.f",9, "%.f",9,"%.f",10,0
section .text
      push   rbp
      mov    rbp,rsp
      push   rbx                  ;callee saved, used for the current row address
      push   r15                  ;callee saved, used for the rows still to print
      and    rsp,-16              ;align the stack once for all printf calls
      mov    rbx,rsi              ;rbx -> first row
      mov    r15,4                ;four rows
.row:
      movsd  xmm0,[rbx]
      movsd  xmm1,[rbx+8]
      movsd  xmm2,[rbx+16]
      movsd  xmm3,[rbx+24]
      mov    rdi,.fmt
      mov    eax,4                ;four floating-point arguments in xmm registers
      call   printf
      add    rbx,32               ;next row (4 doubles of 8 bytes)
      dec    r15
      jnz    .row
      lea    rsp,[rbp-16]         ;undo the alignment, back to the two saved registers
      pop    r15
      pop    rbx
      pop    rbp
      ret
Listing 37-1

transpose4x4.asm

Figure 37-1 shows the output.
../images/483996_1_En_37_Chapter/483996_1_En_37_Fig1_HTML.jpg
Figure 37-1

transpose4x4.asm

The Unpack Version

First a remark about little-endian and packed ymm values. When in the example we have the rows 1, 2, 3, 4, then the little-endian format would be 4, 3, 2, 1. However, because ymm stores packed values in our example, ymm in SASM would look like this: 2, 1, 4, 3. You can verify this with your debugger. This can be confusing when debugging your program. In what follows we will use the little-endian format of 4, 3, 2, 1, and we will not use the 2, 1, 4, 3, format.

With the previous remarks in mind, when the matrix is loaded in the ymm registers, these registers have the following layout (the example values in parentheses):

ymm0

high qword2 (4)

low qword2 (3)

high qword1 (2)

low qword1 (1)

ymm1

high qword4 (8)

low qword4 (7)

high qword3 (6)

low qword3 (5)

...

    
The vunpcklpd instruction in the following:
      vunpcklpd ymm12,ymm0,ymm1
takes the first low quadword from operands 2 and 3 and stores them in operand 1 and then takes the second-lowest quadwords in a similar way to produce the following:

ymm12

low qword4 (7)

low qword2 (3)

low qword3 (5)

low qword1 (1)

Similarly, the instruction vunpckhpd takes the high quadwords from operands 2 and 3 and stores them in operand 1 in a similar fashion.
      vunpckhpd ymm13,ymm0,ymm1

ymm13

high qword4 (8)

high qword2 (4)

high qword3 (6)

high qword1 (2)

The purpose of this method of unpacking is to change column pairs to row pairs. For example, $$ \left[\begin{array}{c}1\\ 5\end{array}\right] $$ becomes [1 5].

After the unpacking, the ymm registers look as follows in little-endian format:

ymm12

7

3

5

1

ymm13

8

4

6

2

ymm14

15

11

13

9

ymm15

16

12

14

10

In human-readable format, instead of little-endian format, we have the following:

1

5

3

7

2

6

4

8

9

13

11

15

10

14

12

16

Now we have to permutate values between the rows to get the values in the correct order. In little-endian format, we need to obtain the following:

13

9

5

1

14

10

6

2

15

11

7

3

16

12

8

4

You may notice that the two lower values of ymm12 and ymm13 are in the correct place. Similarly, the two upper values of ymm14 and ymm15 are in the correct position.

We have to move the two lower values of ymm14 to the upper values of ymm12 and the two lower values of ymm15 to the upper values of ymm13.

The two upper values from ymm12 have to go to the lower values of ymm14, and we want the two upper values of ymm13 to go into the lower positions of ymm15.

The operation for doing that is called permutation. With vperm2f128, we can permutate pairs of two values (128 bits). We use a mask to control the permutation. For example, the mask 00110001 is read as follows, starting at the low bits. Remember in the following explanation that indexing starts at 0.
  • 01: Take the 128-bit high field from source 1 and put it at destination position 0.

  • 00: This has a special meaning; see the following explanation.

  • 11: Take the 128-bit high field from source 2 and put it at destination position 128.

  • 00: This has a special meaning; see the following explanation.

Here again we use little-endian format (4, 3, 2, 1) and do not consider the order in which these values are stored in the ymm registers.

So, in fact, the two 128-bit fields of the two sources are numbered sequentially.
  • Source 1 low field = 00

  • Source 1 high field = 01

  • Source 2 low field = 10

  • Source 2 high field = 11

Special meaning means the following: if you set bit 3 in the mask, the destination low field will be zeroed, and if you set bit 7 in the mask, the destination high field will be zeroed.

Bits 2, 3, 6, and 7 are not used here. In most cases, you can therefore read a mask such as 00110001 by looking only at bits 5–4 and bits 1–0: xx11xx01.

This is what happens in the program:
      vperm2f128    ymm0, ymm12, ymm14, 00100000b
We have 00100000 here.
  • The lower 00 means take the ymm12 low field (5, 1) and put it in the low field of ymm0.

  • The higher 10 means take the ymm14 low field (13, 9) and put it in the high field of ymm0.

ymm12

7

3

5

1

ymm14

15

11

13

9

ymm0

13

9

5

1

Now ymm0 contains a row that is finished. Next comes the next row.
vperm2f128    ymm1, ymm13, ymm15, 00100000b
We have 00100000 here.
  • The lower 00 means take the ymm13 low field (6, 2) and put it in the low field of ymm1.

  • The higher 10 means take the ymm15 low field (14, 10) and put it in the high field of ymm1.

ymm13

8

4

6

2

ymm15

16

12

14

10

ymm1

14

10

6

2

Now ymm1 contains a row that is finished. Here’s the next one:
vperm2f128    ymm2, ymm12, ymm14, 00110001b
We have 00110001 here:
  • The lower 01 means take the ymm12 high field (7, 3) and put it in the low field of ymm2.

  • The higher 11 means take the ymm14 high field (15, 11) and put it in the high field of ymm2.

ymm12

7

3

5

1

ymm14

15

11

13

9

ymm2

15

11

7

3

Now ymm2 contains a row that is finished. Last one!
vperm2f128    ymm3, ymm13, ymm15, 00110001b
We have 00110001 here.
  • The lower 01 means take the ymm13 high field (8,4) and put it in the low field of ymm3.

  • The higher 11 means take the ymm15 high field (16,12) and put it in the high field of ymm3.

ymm13

8

4

6

2

ymm15

16

12

14

10

ymm3

16

12

8

4

And we are done permutating. All that’s left is to copy the rows from the ymm registers into the correct order in memory.

The Shuffle Version

We already used a shuffle instruction called pshufd in Chapter 33. Here we use the instruction vshufpd, which also uses a mask to control the shuffle. Don’t get confused; the instruction pshufd uses an 8-bit mask. The masks we will be using here count as only 4 bits.

Again, we are using little-endian format (remember 4, 3, 2, 1) and do not care how the packed values are stored in the ymm registers. That is the processor’s business.

Refer to the following table and the examples that follow this explanation. The two lower bits in the mask control which packed values go into the destination’s two lower positions; the two upper bits in the mask control which packed values go into the destination’s two upper positions. Bits 0 and 2 specify which value to take from source 1, and bits 1 and 3 specify which value to take from source 2.

Select from upper two values in source 2.

Select from upper two values in source 1.

Select from lower two values in source 2.

Select from lower two values in source 1.

0 = lower value of source 2

0 = lower value of source 1

0 = lower value of source 2

0 = lower value of source 1

1 = higher value of source 2

1 = higher value of source 1

1 = higher value of source 2

1 = higher value of source 1

The two lower values in each of the sources can never end up in the higher positions at the destinations, and the two higher values in each of the source can never end up in the lower positions of the destination. See Figure 37-2 for a schematic overview of a few example masks.
../images/483996_1_En_37_Chapter/483996_1_En_37_Fig2_HTML.png
Figure 37-2

Shuffle mask examples

Here is how it works in our program:
      vshufpd    ymm12,ymm0,ymm1, 0000b

ymm0

4

3

2

1

ymm1

8

7

6

5

ymm12

Low upper ymm1

7

Low upper ymm0

3

Low lower ymm1

5

Low lower ymm0

1

      vshufpd    ymm13,ymm0,ymm1, 1111b

ymm0

4

3

2

1

ymm1

8

7

6

5

ymm13

High upper ymm1

8

High upper ymm0

4

High lower ymm1

6

High lower ymm0

2

      vshufpd    ymm14,ymm2,ymm3, 0000b    

ymm2

12

11

10

9

ymm3

16

15

14

13

ymm14

Low upper ymm3

15

Low upper ymm2

11

Low lower ymm3

13

Low lower ymm2

9

Finally, here’s the last example:
vshufpd    ymm15,ymm2,ymm3, 1111b

ymm2

12

11

10

9

ymm3

16

15

14

13

ymm15

High upper ymm3

16

High upper ymm2

12

High lower ymm3

14

High lower ymm2

10

After applying the shuffle mask, we have eight pairs of values in the ymm registers. We chose the registers so that we obtained the same intermediate result as in the unpack version. Now the pairs need to be rearranged in the right places to form the transpose. We do that in exactly the same way as in the unpack section by permutating fields (blocks) of 128 bits with vperm2f128.

Summary

In this chapter, you learned about the following:
  • That there are two ways to transpose a matrix

  • How to use shuffle, unpack, and permutate instructions

  • That the shuffle and permutate instructions use different masks, whereas the unpack instructions need no mask

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.21.93.20