mirror of
https://github.com/eried/portapack-mayhem.git
synced 2025-01-25 14:08:23 -05:00
Generalize fir_and_decimate_by_2_complex_fast.
Permit decimation factors other than 2. Permit tap counts other than 64 (the count must still be a multiple of 8). Halve the amount of tap memory required. Performance is significantly degraded by the added flexibility -- most likely because the sample buffer is now shifted in a separate phase, instead of being shifted during output sample calculation.
This commit is contained in:
parent
be78ed657f
commit
cde15e4271
@ -203,33 +203,31 @@ size_t fir_and_decimate_by_2_complex_fast(
|
||||
complex16_t* const dst_start,
|
||||
complex16_t* const z,
|
||||
const complex16_t* const taps,
|
||||
const size_t taps_count
|
||||
const size_t taps_count,
|
||||
const size_t decimation_factor
|
||||
) {
|
||||
/* int16_t input (sample count "n" must be multiple of 4)
|
||||
* -> int16_t output, decimated by 2.
|
||||
/* int16_t input (sample count "n" must be multiple of decimation_factor)
|
||||
* -> int16_t output, decimated by decimation_factor.
|
||||
* taps are normalized to 1 << 16 == 1.0.
|
||||
*/
|
||||
auto src_p = src_start;
|
||||
const auto src_p = src_start;
|
||||
auto dst_p = dst_start;
|
||||
auto z_new_p = &z[0];
|
||||
auto t_p = &taps[taps_count * 2];
|
||||
|
||||
while(src_p < &src_start[src_count]) {
|
||||
/* Put two new samples into delay buffer */
|
||||
auto z_new_p = &z[taps_count - decimation_factor];
|
||||
for(size_t i=0; i<decimation_factor; i++) {
|
||||
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
|
||||
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
|
||||
|
||||
t_p -= (taps_count + 2);
|
||||
if( z_new_p == &z[taps_count] ) {
|
||||
z_new_p = &z[0];
|
||||
t_p = &taps[taps_count];
|
||||
}
|
||||
|
||||
size_t loop_count = taps_count / 8;
|
||||
auto t_p = &taps[0];
|
||||
auto z_p = &z[0];
|
||||
|
||||
int64_t t_real = 0;
|
||||
int64_t t_imag = 0;
|
||||
|
||||
auto z_p = &z[0];
|
||||
while(z_p < &z[taps_count]) {
|
||||
while(loop_count > 0) {
|
||||
const auto tap0 = *__SIMD32(t_p)++;
|
||||
const auto sample0 = *__SIMD32(z_p)++;
|
||||
const auto tap1 = *__SIMD32(t_p)++;
|
||||
@ -265,6 +263,8 @@ size_t fir_and_decimate_by_2_complex_fast(
|
||||
t_imag = __SMLALDX(sample6, tap6, t_imag);
|
||||
t_real = __SMLSLD(sample7, tap7, t_real);
|
||||
t_imag = __SMLALDX(sample7, tap7, t_imag);
|
||||
|
||||
loop_count--;
|
||||
}
|
||||
|
||||
/* TODO: Re-evaluate whether saturation is performed, normalization,
|
||||
@ -279,9 +279,30 @@ size_t fir_and_decimate_by_2_complex_fast(
|
||||
i_sat,
|
||||
16
|
||||
);
|
||||
|
||||
/* Shift sample buffer left/down by decimation factor. */
|
||||
const size_t unroll_factor = 4;
|
||||
size_t shift_count = (taps_count - 1) / unroll_factor;
|
||||
|
||||
auto t = &z[0];
|
||||
auto s = &z[decimation_factor];
|
||||
|
||||
while(shift_count > 0) {
|
||||
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||
shift_count--;
|
||||
}
|
||||
|
||||
return src_count / 2;
|
||||
shift_count = (taps_count - 1) % unroll_factor;
|
||||
while(shift_count > 0) {
|
||||
*(t++) = *(s++);
|
||||
shift_count--;
|
||||
}
|
||||
}
|
||||
|
||||
return src_count / decimation_factor;
|
||||
}
|
||||
|
||||
buffer_s16_t DecimateBy2CIC4Real::execute(
|
||||
|
@ -84,7 +84,8 @@ size_t fir_and_decimate_by_2_complex_fast(
|
||||
complex16_t* const dst_start,
|
||||
complex16_t* const z,
|
||||
const complex16_t* const taps,
|
||||
const size_t taps_count
|
||||
const size_t taps_count,
|
||||
const size_t decimation_factor
|
||||
);
|
||||
|
||||
class FIRAndDecimateBy2Complex {
|
||||
@ -100,7 +101,7 @@ public:
|
||||
FIRAndDecimateBy2Complex(
|
||||
const size_t taps_count
|
||||
) : samples_ { std::make_unique<samples_t>(taps_count) },
|
||||
taps_reversed_ { std::make_unique<taps_t>(taps_count * 2) },
|
||||
taps_reversed_ { std::make_unique<taps_t>(taps_count) },
|
||||
taps_count_ { taps_count }
|
||||
{
|
||||
}
|
||||
@ -111,14 +112,13 @@ public:
|
||||
) : FIRAndDecimateBy2Complex(taps.size())
|
||||
{
|
||||
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]);
|
||||
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[taps.size()]);
|
||||
}
|
||||
|
||||
buffer_c16_t execute(
|
||||
buffer_c16_t src,
|
||||
buffer_c16_t dst
|
||||
) {
|
||||
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_);
|
||||
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_, decimation_factor);
|
||||
return { dst.p, dst_count, src.sampling_rate / decimation_factor };
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user