9.5. ARCHITECTURE-SPECIFIC OPTIMIZATIONS 139
for(i=0; i<fir->frameSize; i+=4) {
freg1 = vld1q_f32(input); //load first four elements of input
input += 4;
vst1q_f32(windowPnt1, freg1); //store in window buffer
windowPnt1 += 4;
}
for(i=0; i<fir->frameSize; i++) {
windowPnt2 = fir->window + 1 + i; //copy pointers
coeffsPnt = fir->coefficients;
freg3 = vdupq_n_f32(0.0); // initialize accumulator to zero
for(j=0; j<fir->numCoefficients; j+=4) {
//load four elements of input
freg1 = vld1q_f32(windowPnt2);
windowPnt2 += 4;
//load four filter coefficients
freg2 = vld1q_f32(coeffsPnt);
coeffsPnt += 4;
//multiply-accumulate - freg3 = freg1*freg2+freg3
freg3 = vmlaq_f32(freg3, freg1, freg2);
}
//save output
fir->result[i] = (freg3[0] + freg3[1] + freg3[2] + freg3[3]);
}
}
The overall result is the same as in the previous code versions, but now the linear convolution result
is computed with vectors containing four elements each.