Generalize fir_and_decimate_by_2_complex_fast.

Permit decimation_factors other than 2.
Permit tap counts other than 64 (the count must still be a multiple of 8).
Halve the amount of tap memory required.
Performance is significantly degraded by the greater flexibility -- most likely because of the separate sample buffer shift phase, which replaces performing the shift during output sample calculation.
This commit is contained in:
Jared Boone 2015-11-03 16:17:59 -08:00
parent be78ed657f
commit cde15e4271
2 changed files with 41 additions and 20 deletions

View File

@ -203,33 +203,31 @@ size_t fir_and_decimate_by_2_complex_fast(
complex16_t* const dst_start,
complex16_t* const z,
const complex16_t* const taps,
const size_t taps_count
const size_t taps_count,
const size_t decimation_factor
) {
/* int16_t input (sample count "n" must be multiple of 4)
* -> int16_t output, decimated by 2.
/* int16_t input (sample count "n" must be multiple of decimation_factor)
* -> int16_t output, decimated by decimation_factor.
* taps are normalized to 1 << 16 == 1.0.
*/
auto src_p = src_start;
const auto src_p = src_start;
auto dst_p = dst_start;
auto z_new_p = &z[0];
auto t_p = &taps[taps_count * 2];
while(src_p < &src_start[src_count]) {
/* Put two new samples into delay buffer */
auto z_new_p = &z[taps_count - decimation_factor];
for(size_t i=0; i<decimation_factor; i++) {
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
t_p -= (taps_count + 2);
if( z_new_p == &z[taps_count] ) {
z_new_p = &z[0];
t_p = &taps[taps_count];
}
size_t loop_count = taps_count / 8;
auto t_p = &taps[0];
auto z_p = &z[0];
int64_t t_real = 0;
int64_t t_imag = 0;
auto z_p = &z[0];
while(z_p < &z[taps_count]) {
while(loop_count > 0) {
const auto tap0 = *__SIMD32(t_p)++;
const auto sample0 = *__SIMD32(z_p)++;
const auto tap1 = *__SIMD32(t_p)++;
@ -265,6 +263,8 @@ size_t fir_and_decimate_by_2_complex_fast(
t_imag = __SMLALDX(sample6, tap6, t_imag);
t_real = __SMLSLD(sample7, tap7, t_real);
t_imag = __SMLALDX(sample7, tap7, t_imag);
loop_count--;
}
/* TODO: Re-evaluate whether saturation is performed, normalization,
@ -279,9 +279,30 @@ size_t fir_and_decimate_by_2_complex_fast(
i_sat,
16
);
/* Shift sample buffer left/down by decimation factor. */
const size_t unroll_factor = 4;
size_t shift_count = (taps_count - 1) / unroll_factor;
auto t = &z[0];
auto s = &z[decimation_factor];
while(shift_count > 0) {
*__SIMD32(t)++ = *__SIMD32(s)++;
*__SIMD32(t)++ = *__SIMD32(s)++;
*__SIMD32(t)++ = *__SIMD32(s)++;
*__SIMD32(t)++ = *__SIMD32(s)++;
shift_count--;
}
return src_count / 2;
shift_count = (taps_count - 1) % unroll_factor;
while(shift_count > 0) {
*(t++) = *(s++);
shift_count--;
}
}
return src_count / decimation_factor;
}
buffer_s16_t DecimateBy2CIC4Real::execute(

View File

@ -84,7 +84,8 @@ size_t fir_and_decimate_by_2_complex_fast(
complex16_t* const dst_start,
complex16_t* const z,
const complex16_t* const taps,
const size_t taps_count
const size_t taps_count,
const size_t decimation_factor
);
class FIRAndDecimateBy2Complex {
@ -100,7 +101,7 @@ public:
FIRAndDecimateBy2Complex(
const size_t taps_count
) : samples_ { std::make_unique<samples_t>(taps_count) },
taps_reversed_ { std::make_unique<taps_t>(taps_count * 2) },
taps_reversed_ { std::make_unique<taps_t>(taps_count) },
taps_count_ { taps_count }
{
}
@ -111,14 +112,13 @@ public:
) : FIRAndDecimateBy2Complex(taps.size())
{
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]);
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[taps.size()]);
}
buffer_c16_t execute(
buffer_c16_t src,
buffer_c16_t dst
) {
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_);
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_, decimation_factor);
return { dst.p, dst_count, src.sampling_rate / decimation_factor };
}