mirror of
https://github.com/eried/portapack-mayhem.git
synced 2025-01-11 15:29:28 -05:00
Generalize fir_and_decimate_by_2_complex_fast.
Permit decimation_factor values other than 2. Permit tap counts != 64 (but the count must still be a multiple of 8). Halve the amount of tap memory required. Performance is significantly degraded due to the greater flexibility -- most likely because of the separate sample buffer shift phase, instead of performing the shift during output sample calculation.
This commit is contained in:
parent
be78ed657f
commit
cde15e4271
@ -203,33 +203,31 @@ size_t fir_and_decimate_by_2_complex_fast(
|
|||||||
complex16_t* const dst_start,
|
complex16_t* const dst_start,
|
||||||
complex16_t* const z,
|
complex16_t* const z,
|
||||||
const complex16_t* const taps,
|
const complex16_t* const taps,
|
||||||
const size_t taps_count
|
const size_t taps_count,
|
||||||
|
const size_t decimation_factor
|
||||||
) {
|
) {
|
||||||
/* int16_t input (sample count "n" must be multiple of 4)
|
/* int16_t input (sample count "n" must be multiple of decimation_factor)
|
||||||
* -> int16_t output, decimated by 2.
|
* -> int16_t output, decimated by decimation_factor.
|
||||||
* taps are normalized to 1 << 16 == 1.0.
|
* taps are normalized to 1 << 16 == 1.0.
|
||||||
*/
|
*/
|
||||||
auto src_p = src_start;
|
const auto src_p = src_start;
|
||||||
auto dst_p = dst_start;
|
auto dst_p = dst_start;
|
||||||
auto z_new_p = &z[0];
|
|
||||||
auto t_p = &taps[taps_count * 2];
|
|
||||||
|
|
||||||
while(src_p < &src_start[src_count]) {
|
while(src_p < &src_start[src_count]) {
|
||||||
/* Put two new samples into delay buffer */
|
/* Put two new samples into delay buffer */
|
||||||
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
|
auto z_new_p = &z[taps_count - decimation_factor];
|
||||||
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
|
for(size_t i=0; i<decimation_factor; i++) {
|
||||||
|
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
|
||||||
t_p -= (taps_count + 2);
|
|
||||||
if( z_new_p == &z[taps_count] ) {
|
|
||||||
z_new_p = &z[0];
|
|
||||||
t_p = &taps[taps_count];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t loop_count = taps_count / 8;
|
||||||
|
auto t_p = &taps[0];
|
||||||
|
auto z_p = &z[0];
|
||||||
|
|
||||||
int64_t t_real = 0;
|
int64_t t_real = 0;
|
||||||
int64_t t_imag = 0;
|
int64_t t_imag = 0;
|
||||||
|
|
||||||
auto z_p = &z[0];
|
while(loop_count > 0) {
|
||||||
while(z_p < &z[taps_count]) {
|
|
||||||
const auto tap0 = *__SIMD32(t_p)++;
|
const auto tap0 = *__SIMD32(t_p)++;
|
||||||
const auto sample0 = *__SIMD32(z_p)++;
|
const auto sample0 = *__SIMD32(z_p)++;
|
||||||
const auto tap1 = *__SIMD32(t_p)++;
|
const auto tap1 = *__SIMD32(t_p)++;
|
||||||
@ -265,6 +263,8 @@ size_t fir_and_decimate_by_2_complex_fast(
|
|||||||
t_imag = __SMLALDX(sample6, tap6, t_imag);
|
t_imag = __SMLALDX(sample6, tap6, t_imag);
|
||||||
t_real = __SMLSLD(sample7, tap7, t_real);
|
t_real = __SMLSLD(sample7, tap7, t_real);
|
||||||
t_imag = __SMLALDX(sample7, tap7, t_imag);
|
t_imag = __SMLALDX(sample7, tap7, t_imag);
|
||||||
|
|
||||||
|
loop_count--;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO: Re-evaluate whether saturation is performed, normalization,
|
/* TODO: Re-evaluate whether saturation is performed, normalization,
|
||||||
@ -279,9 +279,30 @@ size_t fir_and_decimate_by_2_complex_fast(
|
|||||||
i_sat,
|
i_sat,
|
||||||
16
|
16
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Shift sample buffer left/down by decimation factor. */
|
||||||
|
const size_t unroll_factor = 4;
|
||||||
|
size_t shift_count = (taps_count - 1) / unroll_factor;
|
||||||
|
|
||||||
|
auto t = &z[0];
|
||||||
|
auto s = &z[decimation_factor];
|
||||||
|
|
||||||
|
while(shift_count > 0) {
|
||||||
|
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||||
|
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||||
|
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||||
|
*__SIMD32(t)++ = *__SIMD32(s)++;
|
||||||
|
shift_count--;
|
||||||
|
}
|
||||||
|
|
||||||
|
shift_count = (taps_count - 1) % unroll_factor;
|
||||||
|
while(shift_count > 0) {
|
||||||
|
*(t++) = *(s++);
|
||||||
|
shift_count--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return src_count / 2;
|
return src_count / decimation_factor;
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer_s16_t DecimateBy2CIC4Real::execute(
|
buffer_s16_t DecimateBy2CIC4Real::execute(
|
||||||
|
@ -84,7 +84,8 @@ size_t fir_and_decimate_by_2_complex_fast(
|
|||||||
complex16_t* const dst_start,
|
complex16_t* const dst_start,
|
||||||
complex16_t* const z,
|
complex16_t* const z,
|
||||||
const complex16_t* const taps,
|
const complex16_t* const taps,
|
||||||
const size_t taps_count
|
const size_t taps_count,
|
||||||
|
const size_t decimation_factor
|
||||||
);
|
);
|
||||||
|
|
||||||
class FIRAndDecimateBy2Complex {
|
class FIRAndDecimateBy2Complex {
|
||||||
@ -100,7 +101,7 @@ public:
|
|||||||
FIRAndDecimateBy2Complex(
|
FIRAndDecimateBy2Complex(
|
||||||
const size_t taps_count
|
const size_t taps_count
|
||||||
) : samples_ { std::make_unique<samples_t>(taps_count) },
|
) : samples_ { std::make_unique<samples_t>(taps_count) },
|
||||||
taps_reversed_ { std::make_unique<taps_t>(taps_count * 2) },
|
taps_reversed_ { std::make_unique<taps_t>(taps_count) },
|
||||||
taps_count_ { taps_count }
|
taps_count_ { taps_count }
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@ -111,14 +112,13 @@ public:
|
|||||||
) : FIRAndDecimateBy2Complex(taps.size())
|
) : FIRAndDecimateBy2Complex(taps.size())
|
||||||
{
|
{
|
||||||
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]);
|
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]);
|
||||||
std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[taps.size()]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer_c16_t execute(
|
buffer_c16_t execute(
|
||||||
buffer_c16_t src,
|
buffer_c16_t src,
|
||||||
buffer_c16_t dst
|
buffer_c16_t dst
|
||||||
) {
|
) {
|
||||||
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_);
|
const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_, decimation_factor);
|
||||||
return { dst.p, dst_count, src.sampling_rate / decimation_factor };
|
return { dst.p, dst_count, src.sampling_rate / decimation_factor };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user