The Android NDK comes with the GCC compiler (version 4.4.3 in release 7 of the NDK). As a consequence, you are able to use the C extensions the GNU Compiler Collection supports. Among the ones that are particularly interesting, as far as performance is concerned, are built-in functions and vector instructions.
NOTE: Visit http://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html
for an exhaustive list of the GCC C extensions.
Built-in functions, sometimes referred to as intrinsics, are functions handled in a special manner by the compiler. Built-in functions are often used to allow for some constructs the language does not support, and are often inlined, that is, the compiler replaces the call with a series of instructions specific to the target and typically optimized. For example, a call to the __builtin_clz()
function would result in a CLZ instruction being generated (if the code is compiled for ARM and the CLZ instruction is available). When no optimized version of the built-in function exists, or when optimizations are turned off, the compiler simply makes a call to a function containing a generic implementation.
For example, GCC supports built-in functions such as __builtin_expect(), __builtin_prefetch(), __builtin_clz(), __builtin_ctz(), and __builtin_popcount().
Using built-in functions allows you to keep your code more generic while still taking advantage of optimizations available on some platforms.
Vector instructions are not really common in C code. However, with more and more CPUs supporting SIMD instructions, using vectors in your algorithms can accelerate your code quite significantly.
Listing 3–31 shows how you can define your own vector type using the vector_size
variable attribute and how you can add two vectors.
// v4int: GCC vector type holding four integers (16 bytes)
typedef int v4int __attribute__ ((vector_size (16)));

/*
 * Adds src to dst element by element (dst[i] += src[i] for the first
 * `size` integers), processing four integers at a time through v4int.
 *
 * dst:  buffer that is read and updated in place
 * src:  buffer that is only read
 * size: number of integers to add; expected to be non-negative
 *
 * NOTE(review): both buffers are accessed through v4int pointers, so they
 * are presumably expected to be 16-byte aligned (the natural alignment of
 * v4int) -- confirm for your target.
 */
void add_buffers_vectorized (int* dst, const int* src, int size)
{
    v4int* dstv4int = (v4int*) dst;
    const v4int* srcv4int = (const v4int*) src;
    int i;

    // main loop: each vector addition handles four integers
    for (i = 0; i < size/4; i++) {
        *dstv4int++ += *srcv4int++;
    }

    // leftovers: the last 1 to 3 integers that do not fill a whole vector
    if (size & 0x3) {
        dst = (int*) dstv4int;
        src = (const int*) srcv4int;
        switch (size & 0x3) {
        case 3: *dst++ += *src++; // fall through
        case 2: *dst++ += *src++; // fall through
        case 1:
        default: *dst += *src;
        }
    }
}
// Scalar reference implementation: adds src to dst one integer at a time
// (dst[i] += src[i] for the first `size` integers).
void add_buffers (int* dst, const int* src, int size)
{
    int remaining;

    for (remaining = size; remaining != 0; remaining--) {
        *dst += *src;
        dst++;
        src++;
    }
}
How this code will be compiled depends on whether the target supports SIMD instructions and whether the compiler is told to use these instructions. To tell the compiler to use NEON instructions, simply add the .neon suffix to the file name in Android.mk's LOCAL_SRC_FILES. Alternatively, you can define LOCAL_ARM_NEON to true
if all files need to be compiled with NEON support.
Listing 3–32 shows the resulting assembly code when the compiler does not use ARM SIMD instructions (NEON) whereas Listing 3–33 shows the use of the NEON instructions. (The add_buffers function is compiled the same way and is not shown in the second listing.) The loop is shown in bold in both listings.
00000000 <add_buffers_vectorized>:
0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
4: f102 0803 add.w r8, r2, #3 ; 0x3
8: ea18 0822 ands.w r8, r8, r2, asr #32
c: bf38 it cc
e: 4690 movcc r8, r2
10: b08e sub sp, #56
12: 4607 mov r7, r0
14: 468c mov ip, r1
16: ea4f 08a8 mov.w r8, r8, asr #2
1a: 9201 str r2, [sp, #4]
1c: f1b8 0f00 cmp.w r8, #0 ; 0x0
20: 4603 mov r3, r0
22: 460e mov r6, r1
24: dd2c ble.n 80 <add_buffers_vectorized+0x80>
26: 2500 movs r5, #0
28: f10d 0928 add.w r9, sp, #40 ; 0x28
2c: 462e mov r6, r5
2e: f10d 0a18 add.w sl, sp, #24 ; 0x18
32: f10d 0b08 add.w fp, sp, #8 ; 0x8
36: 197c adds r4, r7, r5
38: 3601 adds r6, #1
3a: e894 000f ldmia.w r4, {r0, r1, r2, r3}
3e: e889 000f stmia.w r9, {r0, r1, r2, r3}
42: eb0c 0305 add.w r3, ip, r5
46: 3510 adds r5, #16
48: 4546 cmp r6, r8
4a: cb0f ldmia r3!, {r0, r1, r2, r3}
4c: e88a 000f stmia.w sl, {r0, r1, r2, r3}
50: 9b0a ldr r3, [sp, #40]
52: 9a06 ldr r2, [sp, #24]
54: 4413 add r3, r2
56: 9a07 ldr r2, [sp, #28]
58: 9302 str r3, [sp, #8]
5a: 9b0b ldr r3, [sp, #44]
5c: 4413 add r3, r2
5e: 9a08 ldr r2, [sp, #32]
60: 9303 str r3, [sp, #12]
62: 9b0c ldr r3, [sp, #48]
64: 4413 add r3, r2
66: 9a09 ldr r2, [sp, #36]
68: 9304 str r3, [sp, #16]
6a: 9b0d ldr r3, [sp, #52]
6c: 4413 add r3, r2
6e: 9305 str r3, [sp, #20]
70: e89b 000f ldmia.w fp, {r0, r1, r2, r3}
74: e884 000f stmia.w r4, {r0, r1, r2, r3}
78: d1dd bne.n 36 <add_buffers_vectorized+0x36>
7a: 0136 lsls r6, r6, #4
7c: 19bb adds r3, r7, r6
7e: 4466 add r6, ip
80: 9901 ldr r1, [sp, #4]
82: f011 0203 ands.w r2, r1, #3 ; 0x3
86: d007 beq.n 98 <add_buffers_vectorized+0x98>
88: 2a02 cmp r2, #2
8a: d00f beq.n ac <add_buffers_vectorized+0xac>
8c: 2a03 cmp r2, #3
8e: d007 beq.n a0 <add_buffers_vectorized+0xa0>
90: 6819 ldr r1, [r3, #0]
92: 6832 ldr r2, [r6, #0]
94: 188a adds r2, r1, r2
96: 601a str r2, [r3, #0]
98: b00e add sp, #56
9a: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
9e: 4770 bx lr
a0: 6819 ldr r1, [r3, #0]
a2: f856 2b04 ldr.w r2, [r6], #4
a6: 188a adds r2, r1, r2
a8: f843 2b04 str.w r2, [r3], #4
ac: 6819 ldr r1, [r3, #0]
ae: f856 2b04 ldr.w r2, [r6], #4
b2: 188a adds r2, r1, r2
b4: f843 2b04 str.w r2, [r3], #4
b8: e7ea b.n 90 <add_buffers_vectorized+0x90>
ba: bf00 nop
00000000 <add_buffers>:
0: b470 push {r4, r5, r6}
2: b14a cbz r2, 18 <add_buffers+0x18>
4: 2300 movs r3, #0
6: 461c mov r4, r3
8: 58c6 ldr r6, [r0, r3]
a: 3401 adds r4, #1
c: 58cd ldr r5, [r1, r3]
e: 1975 adds r5, r6, r5
10: 50c5 str r5, [r0, r3]
12: 3304 adds r3, #4
14: 4294 cmp r4, r2
16: d1f7 bne.n 8 <add_buffers+0x8>
18: bc70 pop {r4, r5, r6}
1a: 4770 bx lr
00000000 <add_buffers_vectorized>:
0: b470 push {r4, r5, r6}
2: 1cd6 adds r6, r2, #3
4: ea16 0622 ands.w r6, r6, r2, asr #32
8: bf38 it cc
a: 4616 movcc r6, r2
c: 4604 mov r4, r0
e: 460b mov r3, r1
10: 10b6 asrs r6, r6, #2
12: 2e00 cmp r6, #0
14: dd0f ble.n 36 <add_buffers_vectorized+0x36>
16: 460d mov r5, r1
18: 2300 movs r3, #0
1a: 3301 adds r3, #1
1c: ecd4 2b04 vldmia r4, {d18-d19}
20: ecf5 0b04 vldmia r5!, {d16-d17}
24: 42b3 cmp r3, r6
26: ef62 08e0 vadd.i32 q8, q9, q8
2a: ece4 0b04 vstmia r4!, {d16-d17}
2e: d1f4 bne.n 1a <add_buffers_vectorized+0x1a>
30: 011b lsls r3, r3, #4
32: 18c4 adds r4, r0, r3
34: 18cb adds r3, r1, r3
36: f012 0203 ands.w r2, r2, #3 ; 0x3
3a: d008 beq.n 4e <add_buffers_vectorized+0x4e>
3c: 2a02 cmp r2, #2
3e: 4621 mov r1, r4
40: d00d beq.n 5e <add_buffers_vectorized+0x5e>
42: 2a03 cmp r2, #3
44: d005 beq.n 52 <add_buffers_vectorized+0x52>
46: 680a ldr r2, [r1, #0]
48: 681b ldr r3, [r3, #0]
4a: 18d3 adds r3, r2, r3
4c: 600b str r3, [r1, #0]
4e: bc70 pop {r4, r5, r6}
50: 4770 bx lr
52: 6820 ldr r0, [r4, #0]
54: f853 2b04 ldr.w r2, [r3], #4
58: 1882 adds r2, r0, r2
5a: f841 2b04 str.w r2, [r1], #4
5e: 6808 ldr r0, [r1, #0]
60: f853 2b04 ldr.w r2, [r3], #4
64: 1882 adds r2, r0, r2
66: f841 2b04 str.w r2, [r1], #4
6a: e7ec b.n 46 <add_buffers_vectorized+0x46>
You can quickly see that the loop was compiled in far fewer instructions when NEON instructions are used. As a matter of fact, the vldmia
instruction loads four integers from memory, the vadd.i32
instruction performs four additions, and the vstmia
instruction stores four integers in memory. This results in more compact and more efficient code.
Using vectors is a double-edged sword though: they can make your code significantly faster when SIMD instructions are available on the target, but they can also overly complicate your code. (The add_buffers
function is far simpler than its “vectorized” equivalent and results in simpler assembly code: see how many times data is read from and written to the stack in add_buffers_vectorized
when SIMD instructions are not used.)
NOTE: Visit http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
for more information about vectors.
3.135.196.103