© Jo Van Hoey 2019
J. Van Hoey, Beginning x64 Assembly Programming, https://doi.org/10.1007/978-1-4842-5076-1_34

34. SSE String Masks

Jo Van Hoey1 
(1)
Hamme, Belgium
 

Now that we know how to shuffle, we can discuss string masks.

Remember that SSE provides two string manipulation instructions that use a mask: pcmpistrm and pcmpestrm. We will be using implicit length instructions. At first, using masks looks complicated, but once you get the hang of it, you will see how powerful masking can be.

Searching for Characters

Listing 34-1, Listing 34-2, and Listing 34-3 show the example.
; sse_string4.asm
; find a character
; External routines: print16b is the C helper that prints a 16-bit mask,
; printf comes from the C library.
extern print16b
extern printf
section .data
      ; text that is scanned; it ends with a newline and a terminating 0
      string1      db    "qdacdekkfijlmdoza"
                   db    "becdfgdklkmdddaf"
                   db    "fffffffdedeee",10,0
      string2      db    "e",0            ; first character to look for
      string3      db    "a",0            ; second character to look for
      fmt          db    "Find all the characters '%s' "
                   db    "and '%s' in:",10,0
      ; the space after the first '%s' keeps the output readable:
      ; "... characters 'e' and 'a'" instead of "... 'e'and 'a'"
      fmt_oc       db    "I found %ld characters '%s' "
                   db    "and '%s'",10,0
      NL           db    10,0             ; a lone newline
section .bss
section .text
      global main
main:
; Entry point: prints the two search characters and the target string,
; lets pcharsrch print the masks and count the matches, then prints
; the count returned in rax.
        push    rbp
        mov     rbp, rsp
; print the search characters
        mov     rdi, fmt
        mov     rsi, string2
        mov     rdx, string3
        xor     eax, eax                ; variadic call: no vector arguments
        call    printf
; print the target string
        mov     rdi, string1
        xor     eax, eax
        call    printf
; search the string and print the mask
        mov     rdi, string1
        mov     rsi, string2
        mov     rdx, string3
        call    pcharsrch               ; rax = number of matches
; print the number of occurrences of string2 and string3
        mov     rdi, fmt_oc
        mov     rsi, rax                ; copy the count before rax is cleared
        mov     rdx, string2
        mov     rcx, string3
        xor     eax, eax                ; al must hold the vector-arg count (0),
                                        ; not the match count left by pcharsrch
        call    printf
; exit
        leave
        ret
;-------------------------------------------------------------
; pcharsrch - packed character search
; Walks a 0-terminated string in 16-byte blocks, prints the match mask
; of every block and returns the total number of matching characters.
; In:   rdi = address of the string to scan
;       rsi = address of the first search character
;       rdx = address of the second search character
; Out:  rax = number of matching characters
; Register roles inside the loop:
;       rbx  = offset of the current 16-byte block
;       r12d = running total of matches
;       r13d = 16-bit mask of the current block (written by print_mask)
;       cl   = 1 once the block holding the terminating 0 was read
;       xmm1 = the two search characters, xmm2 = current block
; rbx, r12 and r13 are callee-saved, so they are saved and restored here.
pcharsrch:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; 3 save slots + 8 bytes padding,
                                        ; rsp stays 16-byte aligned for the calls
        mov     [rbp-8], rbx
        mov     [rbp-16], r12
        mov     [rbp-24], r13
        xor     r12d, r12d              ; running total of occurrences
        xor     ecx, ecx                ; end-of-string flag
        xor     ebx, ebx                ; block offset
; build xmm1, load the search characters
        pxor    xmm1, xmm1              ; unused bytes stay 0
        pinsrb  xmm1, byte [rsi], 0     ; first char at index 0
        pinsrb  xmm1, byte [rdx], 1     ; second char at index 1
.loop:
        mov     rsi, 16                 ; if no terminating 0, print 16 bits
        movdqu  xmm2, [rdi+rbx]         ; next 16 bytes of the string
        pcmpistrm xmm1, xmm2, 40h       ; 'equal any' and 'byte mask in xmm0'
        setz    cl                      ; ZF=1: terminating 0 in this block
        jnz     .gotoprint              ; setz leaves the flags untouched
; terminating 0 found: fewer than 16 bytes are left
        add     rdi, rbx                ; address of the remaining part
        push    rcx                     ; pstrlen overwrites rcx (cl in use)
        call    pstrlen                 ; rax = length of the remaining part
        pop     rcx
        dec     rax                     ; print one bit less than that length
                                        ; (presumably skips the final newline)
        mov     rsi, rax                ; number of mask bits to print
; print the mask
.gotoprint:
        call    print_mask              ; also leaves the mask bits in r13d
; keep running total of matches
        popcnt  r13d, r13d              ; number of 1 bits = matches in block
        add     r12d, r13d
        test    cl, cl                  ; terminating 0 detected?
        jnz     .exit
        add     rbx, 16                 ; prepare for the next 16 bytes
        jmp     .loop
.exit:
        mov     rdi, NL                 ; finish the mask line with a newline
        xor     eax, eax                ; variadic call: no vector arguments
        call    printf
        mov     rax, r12                ; number of occurrences
        mov     rbx, [rbp-8]
        mov     r12, [rbp-16]
        mov     r13, [rbp-24]
        leave
        ret
;-------------------------------------------------------------
; pstrlen - find the terminating 0 of a string
; In:   rdi = address of the string
; Out:  rax = bytes skipped in full blocks plus the index pcmpistri
;             reports in rcx for the block holding the 0
; rcx is overwritten; xmm0 is saved on entry and restored on exit.
pstrlen:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm0
        movdqu  [rbp-16], xmm0
        pxor    xmm0, xmm0              ; compare against 0 (end of string)
        xor     eax, eax                ; offset of the first block
        jmp     .scan
.advance:
        add     rax, 16                 ; no 0 in that block: take the next one
.scan:
        pcmpistri xmm0, [rdi + rax], 0x08 ; 'equal each'
        jnz     .advance                ; ZF=0: no terminating 0 yet
        add     rax, rcx                ; full blocks + index in last block
        movdqu  xmm0, [rbp-16]          ; restore xmm0
        leave
        ret
;-------------------------------------------------------------
; print_mask - print the byte mask held in xmm0 as 0/1 digits
; In:   xmm0 = byte mask
;       rsi  = number of bits to print (16 or less), passed on to print16b
; Out:  r13d = the mask packed into 16 bits; after the byte reversal
;              bit 15 belongs to the first byte of the block
; rdi, rdx, rcx and xmm1 are preserved across the C call.
; xmm0 is left byte-reversed.
print_mask:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm1
        call    reverse_xmm0            ; first byte of the block becomes the top bit
        pmovmskb r13d, xmm0             ; one bit per mask byte
        movdqu  [rbp-16], xmm1          ; xmm1 may be overwritten by the C code
        push    rdi                     ; address of the string
        push    rdx
        push    rcx                     ; end-of-string flag in cl
        sub     rsp, 8                  ; three pushes leave rsp 8 bytes off;
                                        ; realign to 16 before calling C
                                        ; (assumes an aligned stack at our call)
        mov     edi, r13d               ; first argument: the mask
        call    print16b
        add     rsp, 8
        pop     rcx
        pop     rdx
        pop     rdi
        movdqu  xmm1, [rbp-16]          ; restore xmm1
        leave
        ret
;-------------------------------------------------------------
; reverse_xmm0 - reverse the order of the 16 bytes in xmm0
; In/Out: xmm0
; xmm2 serves as scratch register and is restored before returning.
reverse_xmm0:
section .data
; shuffle control: destination byte i is taken from source byte 15-i
        .bytereverse db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
section .text
        sub     rsp, 16                 ; temporary slot for xmm2
        movdqu  [rsp], xmm2
        movdqu  xmm2, [.bytereverse]    ; load the shuffle control
        pshufb  xmm0, xmm2              ; xmm0 now holds its bytes reversed
        movdqu  xmm2, [rsp]
        add     rsp, 16
        ret
Listing 34-1

string4.asm

// print16b.c
#include <stdio.h>
#include <string.h>
/*
 * print16b - print the upper bits of a 16-bit mask as '0'/'1' characters.
 *
 * n      : value whose bits 15..0 hold the mask
 * length : number of bits to print, starting at bit 15 and going down;
 *          values above 16 are treated as 16, values of 0 or less
 *          print nothing
 */
void print16b(long long n, int length){
      int bit;
      if (length <= 0)
            return;
      if (length > 16)
            length = 16;      /* never shift by a negative count */
      for (bit = 15; bit >= 16 - length; bit--)
            putchar(((n >> bit) & 1) ? '1' : '0');
}
Listing 34-2

print16b.c

# Builds sse_string4 from the assembly source and the C print helper.
# (In the real makefile every recipe line must start with a TAB.)
sse_string4: sse_string4.o print16b.o
      gcc -o sse_string4 sse_string4.o print16b.o -no-pie
sse_string4.o: sse_string4.asm
      nasm -f elf64 -g -F dwarf sse_string4.asm -l sse_string4.lst
# The target carries the name of the file the recipe produces, so this
# rule is the one used for the print16b.o prerequisite above.
print16b.o: print16b.c
      gcc -c print16b.c
Listing 34-3

makefile

The main part of the program is quite simple, but as with the previous examples, the program is complicated by the fact that we want to print some result on the screen. We could have avoided the printing parts and used a debugger to study the results in the registers and memory. But coping with the challenges of printing is fun, right?

Figure 34-1 shows the output.
../images/483996_1_En_34_Chapter/483996_1_En_34_Fig1_HTML.jpg
Figure 34-1

sse_string4.asm output

In our example program, we are going to search for two characters in a string. We provide a string, aptly called string1, and we look for the character 'e', which we stored in string2, and the character 'a', stored in string3.

We use a number of functions. Let’s first discuss the function reverse_xmm0 . This function takes xmm0 as an argument and reverses the order of the bytes using a shuffle. By doing so, we will be able to print xmm0 starting with the least significant bytes first and thus print in little-endian format. That is why we presented shuffling in the previous chapter.

We also have a function to measure the length of a string: pstrlen. We need this because we will be reading 16-byte blocks. The last block will probably not contain 16 bytes, so for the last block, we need to determine the position of the terminating 0. This will help us to print a mask that has the same length as the string.

Our custom function pcharsrch, which takes the three strings as arguments, is where the action takes place. In the function we first do some housekeeping such as initializing registers. Register xmm1 will be used as a mask; we store the characters to search for in xmm1 with the instruction pinsrb (packed insert bytes). Then we start looping, copying each time 16 bytes of string1 in xmm2, in search of our character, or the terminating null. We use the masking instruction pcmpistrm (packed compare implicit length string with a mask). The pcmpistrm instruction takes as a third operand an immediate control byte specifying what to do, in this case “equal any” and a “byte mask in xmm0.” So, we will be looking for “any” character that “equals” our search strings. For every matching character in xmm2, the bit in xmm0 that corresponds to the position of the matching character in xmm2 will be set to 1. The pcmpistrm instruction does not have xmm0 as an operand, but it is used implicitly. The return mask will always be kept in xmm0.

The difference with pcmpistri is that pcmpistri would return only a single index, the position of one match, in ecx. But pcmpistrm will return all matching positions in xmm0 for the 16-byte block. That allows you to drastically cut down on the number of steps to execute in order to find all matches.

You can use a bit mask or a byte mask for xmm0 (set or clear bit 6 in the control byte). We used a byte mask so that you can read the xmm0 register more easily with a debugger, two ffs in xmm0 indicate a byte with all the bits set to 1.

After the first 16-byte block is investigated, we verify whether we have found a terminating 0 and store the result of the verification in cl for later use. We want to print the mask stored in xmm0 with the function print_mask. In the debugger, notice that the byte mask is reversed in xmm0, because of the little-endian format. So, before printing, we have to reverse it; that is what we do in our function reverse_xmm0. Then we call our C function print16b to print the reversed mask. However, we cannot provide xmm0 as an argument to print16b, because under the covers print16b is using printf, and printf will interpret xmm0 as a floating-point value, not a byte mask. So, before calling print16b, we transfer the byte mask in xmm0 to r13d as a bit mask, with the instruction pmovmskb (which means “move byte mask”). We will use r13d later for counting; for printing we copy it to edi. We store xmm1 on the stack for later use.

We call the C function print16b to print the mask. This function takes edi (the mask) and rsi (length, passed from the caller) as arguments.

Upon returning to pcharsrch, we count the number of 1s in r13d with the instruction popcnt and update the counter in r12d. We also determine whether we have to exit the loop because a terminating null was detected in the block of bytes.

Before calling print_mask, when a terminating 0 is found, the relevant length of the last block is determined with the function pstrlen. The start address of that block is determined by adding rbx, containing the already screened bytes from previous blocks, to rdi, the address of string1. The string length, returned in rax, is used to compute the number of remaining mask bytes in xmm0 that are passed in rsi to print.

Isn’t printing a lot of fun?

Don’t be overwhelmed by the printing stuff. Concentrate first on how masks work, which is the main purpose of this chapter.

What can we do with a mask returned by pcmpistrm? Well, the resulting mask can be used, for example, to count all the occurrences of a search argument or to find all occurrences and replace them with something else, creating your own find-and-replace functionality.

Now let’s look at another search.

Searching for a Range of Characters

A range can be any number of characters to search for, e.g., all uppercase characters, all characters between a and k, all characters that represent digits, and so on.

Listing 34-4 shows how to search a string for uppercase characters.
; sse_string5.asm
; find a range of characters
; External routines: print16b prints a 16-bit mask, printf is called
; for all other output.
extern print16b
extern printf
section .data
      ; text that is scanned; it ends with a newline and a terminating 0
      string1      db    "eeAecdkkFijlmeoZa"
                   db    "bcefgeKlkmeDad"
                   db    "fdsafadfaseeE",10,0
      ; range limits; prangesrch inserts only the first byte of each
      ; into xmm1
      startrange   db    "A",10,0      ;look for uppercase
      stoprange    db    "Z",10,0
      NL           db    10,0
      fmt          db    "Find the uppercase letters in:",10,0
      fmt_oc       db    "I found %ld uppercase letters",10,0
section .bss
section .text
      global main
main:
; Entry point: prints the title and the string, lets prangesrch print
; the masks and count the matches, then prints the returned count.
        push    rbp
        mov     rbp, rsp
; title
        xor     eax, eax                ; variadic call: no vector arguments
        lea     rdi, [rel fmt]
        call    printf
; the string itself
        xor     eax, eax
        lea     rdi, [rel string1]
        call    printf
; search the string
        lea     rdx, [rel stoprange]
        lea     rsi, [rel startrange]
        lea     rdi, [rel string1]
        call    prangesrch              ; rax = number of matches
; print the number of occurrences
        mov     rsi, rax                ; copy the count before rax is cleared
        lea     rdi, [rel fmt_oc]
        xor     eax, eax
        call    printf
        leave
        ret
;-------------------------------------------------------------
; prangesrch - packed range search
; Walks a 0-terminated string in 16-byte blocks, prints the match mask
; of every block and returns how many characters fall inside the range.
; In:   rdi = address of the string to scan
;       rsi = address of the character that starts the range
;       rdx = address of the character that ends the range
; Out:  rax = number of matching characters
; Register roles inside the loop:
;       rbx  = offset of the current 16-byte block
;       r12d = running total of matches
;       r13d = 16-bit mask of the current block (written by print_mask)
;       cl   = 1 once the block holding the terminating 0 was read
;       xmm1 = range limits, xmm2 = current block
; rbx, r12 and r13 are callee-saved, so they are saved and restored here.
prangesrch:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; 3 save slots + 8 bytes padding,
                                        ; rsp stays 16-byte aligned for the calls
        mov     [rbp-8], rbx
        mov     [rbp-16], r12
        mov     [rbp-24], r13
        xor     r12d, r12d              ; number of occurrences
        xor     ecx, ecx                ; end-of-string flag
        xor     ebx, ebx                ; block offset
; build xmm1
        pxor    xmm1, xmm1              ; unused bytes stay 0
        pinsrb  xmm1, byte [rsi], 0     ; startrange at index 0
        pinsrb  xmm1, byte [rdx], 1     ; stoprange at index 1
.loop:
        mov     rsi, 16                 ; if no terminating 0, print 16 bits
        movdqu  xmm2, [rdi+rbx]         ; next 16 bytes of the string
        pcmpistrm xmm1, xmm2, 01000100b ; 'equal range' | byte mask in xmm0
        setz    cl                      ; ZF=1: terminating 0 in this block
        jnz     .gotoprint              ; setz leaves the flags untouched
; terminating 0 found: fewer than 16 bytes are left
        add     rdi, rbx                ; take only the tail of the string
        push    rcx                     ; pstrlen overwrites rcx (cl in use)
        call    pstrlen                 ; rax = length of the tail
        pop     rcx
        dec     rax                     ; print one bit less than that length
                                        ; (presumably skips the final newline)
        mov     rsi, rax                ; number of mask bits to print
; print the mask
.gotoprint:
        call    print_mask              ; also leaves the mask bits in r13d
; keep running total of matches
        popcnt  r13d, r13d              ; number of 1 bits = matches in block
        add     r12d, r13d
        test    cl, cl                  ; terminating 0 detected?
        jnz     .exit
        add     rbx, 16                 ; prepare for next block
        jmp     .loop
.exit:
        mov     rdi, NL                 ; finish the mask line with a newline
        xor     eax, eax                ; variadic call: no vector arguments
        call    printf
        mov     rax, r12                ; return the number of occurrences
        mov     rbx, [rbp-8]
        mov     r12, [rbp-16]
        mov     r13, [rbp-24]
        leave
        ret
;-------------------------------------------------------------
; pstrlen - find the terminating 0 of a string
; In:   rdi = address of the string
; Out:  rax = bytes skipped in full blocks plus the index pcmpistri
;             reports in rcx for the block holding the 0
; rcx is overwritten; xmm0 is saved on entry and restored on exit.
pstrlen:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm0
        movdqu  [rbp-16], xmm0
        pxor    xmm0, xmm0              ; compare against 0 (end of string)
        xor     eax, eax                ; offset of the first block
        jmp     .scan
.advance:
        add     rax, 16                 ; no 0 in that block: take the next one
.scan:
        pcmpistri xmm0, [rdi + rax], 0x08 ; 'equal each'
        jnz     .advance                ; ZF=0: no terminating 0 yet
        add     rax, rcx                ; full blocks + index in last block
        movdqu  xmm0, [rbp-16]          ; restore xmm0
        leave
        ret
;-------------------------------------------------------------
; print_mask - print the byte mask held in xmm0 as 0/1 digits
; In:   xmm0 = byte mask
;       rsi  = number of bits to print (16 or less), passed on to print16b
; Out:  r13d = the mask packed into 16 bits; after the byte reversal
;              bit 15 belongs to the first byte of the block
; rdi, rdx, rcx and xmm1 are preserved across the C call.
; xmm0 is left byte-reversed.
print_mask:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm1
        call    reverse_xmm0            ; first byte of the block becomes the top bit
        pmovmskb r13d, xmm0             ; one bit per mask byte
        movdqu  [rbp-16], xmm1          ; xmm1 may be overwritten by the C code
        push    rdi                     ; address of the string
        push    rdx
        push    rcx                     ; end-of-string flag in cl
        sub     rsp, 8                  ; three pushes leave rsp 8 bytes off;
                                        ; realign to 16 before calling C
                                        ; (assumes an aligned stack at our call)
        mov     edi, r13d               ; first argument: the mask
        call    print16b
        add     rsp, 8
        pop     rcx
        pop     rdx
        pop     rdi
        movdqu  xmm1, [rbp-16]          ; restore xmm1
        leave
        ret
;-------------------------------------------------------------
; reverse_xmm0 - reverse the order of the 16 bytes in xmm0
; In/Out: xmm0
; xmm2 serves as scratch register and is restored before returning.
reverse_xmm0:
section .data
; shuffle control: destination byte i is taken from source byte 15-i
        .bytereverse db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
section .text
        sub     rsp, 16                 ; temporary slot for xmm2
        movdqu  [rsp], xmm2
        movdqu  xmm2, [.bytereverse]    ; load the shuffle control
        pshufb  xmm0, xmm2              ; xmm0 now holds its bytes reversed
        movdqu  xmm2, [rsp]
        add     rsp, 16
        ret
Listing 34-4

string5.asm

This program is almost entirely the same as the previous one; we just gave string2 and string3 more meaningful names. Most important, we changed the control byte that is handed to pcmpistrm to 01000100b, which means “equal range” and “mask byte in xmm0.”

The print handling is the same as in the previous section.

Figure 34-2 shows the output.
../images/483996_1_En_34_Chapter/483996_1_En_34_Fig2_HTML.jpg
Figure 34-2

sse_string5.asm output

Let’s see one more example.

Searching for a Substring

Listing 34-5 shows the code.
; sse_string6.asm
; find a substring
; External routines: print16b prints a 16-bit mask, printf is called
; for all other output.
extern print16b
extern printf
section .data
      ; text that is scanned; it ends with a newline and a terminating 0
      string1      db    "a quick pink dinosour jumps over the "
                   db    "lazy river and the lazy dinosour "
                   db    "doesn't mind",10,0
      ; substring to look for; psubstringsrch loads it into xmm1
      string2      db    "dinosour",0
      NL           db    10,0
      fmt          db    "Find the substring '%s' in:",10,0
      ; %ld = number of matches, %s = the substring (a plural 's' follows)
      fmt_oc       db    "I found %ld %ss",10,0
section .bss
section .text
      global main
main:
; Entry point: prints the substring and the target string, lets
; psubstringsrch print the masks and count the matches, then prints
; the count returned in rax.
        push    rbp
        mov     rbp, rsp
; first print the strings
        mov     rdi, fmt
        mov     rsi, string2
        xor     eax, eax                ; variadic call: no vector arguments
        call    printf
        mov     rdi, string1
        xor     eax, eax
        call    printf
; search the string
        mov     rdi, string1
        mov     rsi, string2
        call    psubstringsrch          ; rax = number of matches
; print the number of occurrences of the substring
        mov     rdi, fmt_oc
        mov     rsi, rax                ; copy the count before rax is cleared
        mov     rdx, string2
        xor     eax, eax                ; al must hold the vector-arg count (0),
                                        ; not the match count
        call    printf
        leave
        ret
;-------------------------------------------------------------
; psubstringsrch - packed substring search
; Walks a 0-terminated string in 16-byte blocks, prints the match mask
; of every block and returns the number of mask bits that were set.
; In:   rdi = address of the string to scan
;       rsi = address of the 0-terminated substring
; Out:  rax = number of matches counted
; Register roles inside the loop:
;       rbx  = offset of the current 16-byte block
;       r12d = running total of matches
;       r13d = 16-bit mask of the current block (written by print_mask)
;       cl   = 1 once the block holding the terminating 0 was read
;       xmm1 = substring, xmm2 = current block
; rbx, r12 and r13 are callee-saved, so they are saved and restored here.
; NOTE(review): every block is searched on its own. Presumably a match
; that straddles a block boundary is only counted because 'equal ordered'
; reports a partial match at the end of a block - confirm against the
; PCMPISTRM documentation before trusting the count for arbitrary input.
psubstringsrch:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; 3 save slots + 8 bytes padding,
                                        ; rsp stays 16-byte aligned for the calls
        mov     [rbp-8], rbx
        mov     [rbp-16], r12
        mov     [rbp-24], r13
        xor     r12d, r12d              ; running total of occurrences
        xor     ecx, ecx                ; end-of-string flag
        xor     ebx, ebx                ; block offset
; build xmm1, load substring
        movdqu  xmm1, [rsi]             ; loads 16 bytes starting at the substring
.loop:
        mov     rsi, 16                 ; if no 0, print 16 bits
        movdqu  xmm2, [rdi+rbx]         ; next 16 bytes of the string
        pcmpistrm xmm1, xmm2, 01001100b ; 'equal ordered' | 'byte mask in xmm0'
        setz    cl                      ; ZF=1: terminating 0 in this block
        jnz     .gotoprint              ; setz leaves the flags untouched
; terminating 0 found: fewer than 16 bytes are left
        add     rdi, rbx                ; take only the tail of the string
        push    rcx                     ; pstrlen overwrites rcx (cl in use)
        call    pstrlen                 ; rax = position of the 0 in the tail
        pop     rcx                     ; get the end flag back; this was a
                                        ; second push, which lost the flag and
                                        ; left the stack unbalanced
        dec     rax                     ; print one bit less than that length
                                        ; (presumably skips the final newline)
        mov     rsi, rax                ; number of mask bits to print
; print the mask
.gotoprint:
        call    print_mask              ; also leaves the mask bits in r13d
; keep running total of matches
        popcnt  r13d, r13d              ; number of 1 bits = matches in block
        add     r12d, r13d
        test    cl, cl                  ; terminating 0 detected?
        jnz     .exit
        add     rbx, 16                 ; prepare for next block
        jmp     .loop
.exit:
        mov     rdi, NL                 ; finish the mask line with a newline
        xor     eax, eax                ; variadic call: no vector arguments
        call    printf
        mov     rax, r12                ; return the number of occurrences
        mov     rbx, [rbp-8]
        mov     r12, [rbp-16]
        mov     r13, [rbp-24]
        leave
        ret
;-------------------------------------------------------------
; pstrlen - find the terminating 0 of a string
; In:   rdi = address of the string
; Out:  rax = bytes skipped in full blocks plus the index pcmpistri
;             reports in rcx for the block holding the 0
; rcx is overwritten; xmm0 is saved on entry and restored on exit.
pstrlen:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm0
        movdqu  [rbp-16], xmm0
        pxor    xmm0, xmm0              ; compare against 0 (end of string)
        xor     eax, eax                ; offset of the first block
        jmp     .scan
.advance:
        add     rax, 16                 ; no 0 in that block: take the next one
.scan:
        pcmpistri xmm0, [rdi + rax], 0x08 ; 'equal each'
        jnz     .advance                ; ZF=0: no terminating 0 yet
        add     rax, rcx                ; full blocks + index in last block
        movdqu  xmm0, [rbp-16]          ; restore xmm0
        leave
        ret
;-------------------------------------------------------------
; print_mask - print the byte mask held in xmm0 as 0/1 digits
; In:   xmm0 = byte mask
;       rsi  = number of bits to print (16 or less), passed on to print16b
; Out:  r13d = the mask packed into 16 bits; after the byte reversal
;              bit 15 belongs to the first byte of the block
; rdi, rdx, rcx and xmm1 are preserved across the C call.
; xmm0 is left byte-reversed.
print_mask:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; save slot for xmm1
        call    reverse_xmm0            ; first byte of the block becomes the top bit
        pmovmskb r13d, xmm0             ; one bit per mask byte, into r13d
        movdqu  [rbp-16], xmm1          ; xmm1 may be overwritten by the C code
        push    rdi                     ; address of the string
        push    rdx
        push    rcx                     ; end-of-string flag in cl
        sub     rsp, 8                  ; three pushes leave rsp 8 bytes off;
                                        ; realign to 16 before calling C
                                        ; (assumes an aligned stack at our call)
        mov     edi, r13d               ; first argument: the mask
        call    print16b
        add     rsp, 8
        pop     rcx
        pop     rdx
        pop     rdi
        movdqu  xmm1, [rbp-16]          ; restore xmm1
        leave
        ret
;-------------------------------------------------------------
; reverse_xmm0 - reverse the order of the 16 bytes in xmm0
; In/Out: xmm0
; xmm2 serves as scratch register and is restored before returning.
reverse_xmm0:
section .data
; shuffle control: destination byte i is taken from source byte 15-i
        .bytereverse db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
section .text
        sub     rsp, 16                 ; temporary slot for xmm2
        movdqu  [rsp], xmm2
        movdqu  xmm2, [.bytereverse]    ; load the shuffle control
        pshufb  xmm0, xmm2              ; xmm0 now holds its bytes reversed
        movdqu  xmm2, [rsp]
        add     rsp, 16
        ret
Listing 34-5

string6.asm

We used almost the same code as before; we only changed the strings, and the control byte contains “equal ordered” and “byte mask in xmm0.” Pretty easy, isn't it?

Figure 34-3 shows the output.
../images/483996_1_En_34_Chapter/483996_1_En_34_Fig3_HTML.jpg
Figure 34-3

sse_string6.asm output

Summary

In this chapter, you learned about the following:
  • Using string masks

  • Searching for characters, ranges, and substrings

  • Printing masks from xmm registers

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.148.104.242