diff --git a/firmware/baseband/dsp_decimate.cpp b/firmware/baseband/dsp_decimate.cpp index 4eeb6d2b..165fbcfb 100644 --- a/firmware/baseband/dsp_decimate.cpp +++ b/firmware/baseband/dsp_decimate.cpp @@ -203,33 +203,31 @@ size_t fir_and_decimate_by_2_complex_fast( complex16_t* const dst_start, complex16_t* const z, const complex16_t* const taps, - const size_t taps_count + const size_t taps_count, + const size_t decimation_factor ) { - /* int16_t input (sample count "n" must be multiple of 4) - * -> int16_t output, decimated by 2. + /* int16_t input (sample count "n" must be multiple of decimation_factor) + * -> int16_t output, decimated by decimation_factor. * taps are normalized to 1 << 16 == 1.0. */ - auto src_p = src_start; + const auto src_p = src_start; auto dst_p = dst_start; - auto z_new_p = &z[0]; - auto t_p = &taps[taps_count * 2]; while(src_p < &src_start[src_count]) { /* Put two new samples into delay buffer */ - *__SIMD32(z_new_p)++ = *__SIMD32(src_p)++; - *__SIMD32(z_new_p)++ = *__SIMD32(src_p)++; - - t_p -= (taps_count + 2); - if( z_new_p == &z[taps_count] ) { - z_new_p = &z[0]; - t_p = &taps[taps_count]; + auto z_new_p = &z[taps_count - decimation_factor]; + for(size_t i=0; i 0) { const auto tap0 = *__SIMD32(t_p)++; const auto sample0 = *__SIMD32(z_p)++; const auto tap1 = *__SIMD32(t_p)++; @@ -265,6 +263,8 @@ size_t fir_and_decimate_by_2_complex_fast( t_imag = __SMLALDX(sample6, tap6, t_imag); t_real = __SMLSLD(sample7, tap7, t_real); t_imag = __SMLALDX(sample7, tap7, t_imag); + + loop_count--; } /* TODO: Re-evaluate whether saturation is performed, normalization, @@ -279,9 +279,30 @@ size_t fir_and_decimate_by_2_complex_fast( i_sat, 16 ); + + /* Shift sample buffer left/down by decimation factor. */ + const size_t unroll_factor = 4; + size_t shift_count = (taps_count - 1) / unroll_factor; + + auto t = &z[0]; + auto s = &z[decimation_factor]; + + while(shift_count > 0) { + *__SIMD32(t)++ = *__SIMD32(s)++; + *__SIMD32(t)++ = *__SIMD32(s)++; + *__SIMD32(t)++ = *__SIMD32(s)++; + *__SIMD32(t)++ = *__SIMD32(s)++; + shift_count--; + } + + shift_count = (taps_count - 1) % unroll_factor; + while(shift_count > 0) { + *(t++) = *(s++); + shift_count--; + } } - return src_count / 2; + return src_count / decimation_factor; } buffer_s16_t DecimateBy2CIC4Real::execute( diff --git a/firmware/baseband/dsp_decimate.hpp b/firmware/baseband/dsp_decimate.hpp index aed3dcc6..e24207c6 100644 --- a/firmware/baseband/dsp_decimate.hpp +++ b/firmware/baseband/dsp_decimate.hpp @@ -84,7 +84,8 @@ size_t fir_and_decimate_by_2_complex_fast( complex16_t* const dst_start, complex16_t* const z, const complex16_t* const taps, - const size_t taps_count + const size_t taps_count, + const size_t decimation_factor ); class FIRAndDecimateBy2Complex { @@ -100,7 +101,7 @@ public: FIRAndDecimateBy2Complex( const size_t taps_count ) : samples_ { std::make_unique(taps_count) }, - taps_reversed_ { std::make_unique(taps_count * 2) }, + taps_reversed_ { std::make_unique(taps_count) }, taps_count_ { taps_count } { } @@ -111,14 +112,13 @@ public: ) : FIRAndDecimateBy2Complex(taps.size()) { std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]); - std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[taps.size()]); } buffer_c16_t execute( buffer_c16_t src, buffer_c16_t dst ) { - const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_); + const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_, decimation_factor); return { dst.p, dst_count, src.sampling_rate / decimation_factor }; }