Generalize fir_and_decimate_by_2_complex_fast.

Permit decimation factors other than 2. Permit tap counts other than 64 (the count must still be a multiple of 8). Halve the amount of tap memory required. Performance is significantly degraded by the greater flexibility -- most likely because of the separate sample buffer shift phase, which replaces performing the shift during output sample calculation.
2025-09-21 13:34:54 -04:00 · 2015-11-03 16:17:59 -08:00 · 2015-11-03 16:17:59 -08:00 · cde15e4271
commit cde15e4271
parent be78ed657f
2 changed files with 41 additions and 20 deletions
--- a/firmware/baseband/dsp_decimate.cpp
+++ b/firmware/baseband/dsp_decimate.cpp
@ -203,33 +203,31 @@ size_t fir_and_decimate_by_2_complex_fast(
 	complex16_t* const dst_start,
 	complex16_t* const z,
 	const complex16_t* const taps,
-	const size_t taps_count
+	const size_t taps_count,
+	const size_t decimation_factor
 ) {
-	/* int16_t input (sample count "n" must be multiple of 4)
-	 * -> int16_t output, decimated by 2.
+	/* int16_t input (sample count "n" must be multiple of decimation_factor)
+	 * -> int16_t output, decimated by decimation_factor.
 	 * taps are normalized to 1 << 16 == 1.0.
 	 */
-	auto src_p = src_start;
+	const auto src_p = src_start;
 	auto dst_p = dst_start;
-	auto z_new_p = &z[0];
-	auto t_p = &taps[taps_count * 2];

 	while(src_p < &src_start[src_count]) {
 		/* Put two new samples into delay buffer */
-		*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
-		*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
-
-		t_p -= (taps_count + 2);
-		if( z_new_p == &z[taps_count] ) {
-			z_new_p = &z[0];
-			t_p = &taps[taps_count];
+		auto z_new_p = &z[taps_count - decimation_factor];
+		for(size_t i=0; i<decimation_factor; i++) {
+			*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
 		}

+		size_t loop_count = taps_count / 8;
+		auto t_p = &taps[0];
+		auto z_p = &z[0];
+
 		int64_t t_real = 0;
 		int64_t t_imag = 0;

-		auto z_p = &z[0];
-		while(z_p < &z[taps_count]) {
+		while(loop_count > 0) {
 			const auto tap0 = *__SIMD32(t_p)++;
 			const auto sample0 = *__SIMD32(z_p)++;
 			const auto tap1 = *__SIMD32(t_p)++;
@ -265,6 +263,8 @@ size_t fir_and_decimate_by_2_complex_fast(
 			t_imag = __SMLALDX(sample6, tap6, t_imag);
 			t_real = __SMLSLD(sample7, tap7, t_real);
 			t_imag = __SMLALDX(sample7, tap7, t_imag);
+
+			loop_count--;
 		}

 		/* TODO: Re-evaluate whether saturation is performed, normalization,
@ -279,9 +279,30 @@ size_t fir_and_decimate_by_2_complex_fast(
 			i_sat,
 			16
 		);
+
+		/* Shift sample buffer left/down by decimation factor. */
+		const size_t unroll_factor = 4;
+		size_t shift_count = (taps_count - 1) / unroll_factor;
+
+		auto t = &z[0];
+		auto s = &z[decimation_factor];
+		
+		while(shift_count > 0) {
+			*__SIMD32(t)++ = *__SIMD32(s)++;
+			*__SIMD32(t)++ = *__SIMD32(s)++;
+			*__SIMD32(t)++ = *__SIMD32(s)++;
+			*__SIMD32(t)++ = *__SIMD32(s)++;
+			shift_count--;
+		}
+
+		shift_count = (taps_count - 1) % unroll_factor;
+		while(shift_count > 0) {
+			*(t++) = *(s++);
+			shift_count--;
+		}
 	}

-	return src_count / 2;
+	return src_count / decimation_factor;
 }

 buffer_s16_t DecimateBy2CIC4Real::execute(
--- a/firmware/baseband/dsp_decimate.hpp
+++ b/firmware/baseband/dsp_decimate.hpp
@ -84,7 +84,8 @@ size_t fir_and_decimate_by_2_complex_fast(
 	complex16_t* const dst_start,
 	complex16_t* const z,
 	const complex16_t* const taps,
-	const size_t taps_count
+	const size_t taps_count,
+	const size_t decimation_factor
 );

 class FIRAndDecimateBy2Complex {
@ -100,7 +101,7 @@ public:
 	FIRAndDecimateBy2Complex(
 	 	const size_t taps_count
 	) : samples_ { std::make_unique<samples_t>(taps_count) },
-		taps_reversed_ { std::make_unique<taps_t>(taps_count * 2) },
+		taps_reversed_ { std::make_unique<taps_t>(taps_count) },
 		taps_count_ { taps_count }
 	{
 	}
@ -111,14 +112,13 @@ public:
 	) : FIRAndDecimateBy2Complex(taps.size())
 	{
 		std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[0]);
-		std::reverse_copy(taps.cbegin(), taps.cend(), &taps_reversed_[taps.size()]);
 	}

 	buffer_c16_t execute(
 		buffer_c16_t src,
 		buffer_c16_t dst
 	) {
-		const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_);
+		const auto dst_count = fir_and_decimate_by_2_complex_fast(src.p, src.count, dst.p, &samples_[0], &taps_reversed_[0], taps_count_, decimation_factor);
 		return { dst.p, dst_count, src.sampling_rate / decimation_factor };
 	}