From a02bfe55d0b9fbcac2a35006855b1f5a863c0c4a Mon Sep 17 00:00:00 2001
From: Jared Boone <jboone@earfeast.com>
Date: Sat, 2 Jan 2016 10:34:17 -0800
Subject: [PATCH] Decimators for wider IFIR output bandwidth.

---
 firmware/baseband/dsp_decimate.cpp | 119 +++++++++++++++++++++++++++++
 firmware/baseband/dsp_decimate.hpp |  58 ++++++++++++++
 2 files changed, 177 insertions(+)
diff --git a/firmware/baseband/dsp_decimate.cpp b/firmware/baseband/dsp_decimate.cpp
index 5698b33d..c8a70898 100644
--- a/firmware/baseband/dsp_decimate.cpp
+++ b/firmware/baseband/dsp_decimate.cpp
@@ -177,6 +177,71 @@ static inline uint32_t scale_round_and_pack(
 	return __PKHBT(saturated_real, saturated_imag, 16);
 }
 
+// FIRC8xR16x24FS4Decim4 //////////////////////////////////////////////////
+
+FIRC8xR16x24FS4Decim4::FIRC8xR16x24FS4Decim4() { // Constructs the decimator with a cleared z_ delay buffer.
+	z_.fill({}); // Every element of z_ becomes a value-initialized vec2_s16.
+}
+
+void FIRC8xR16x24FS4Decim4::configure( // Stores a sign-adjusted copy of the taps plus the output scale.
+	const std::array<tap_t, taps_count>& taps, // Caller's taps; copied into taps_ with the sign changes below.
+	const int32_t scale, // Saved in output_scale and later passed to scale_round_and_pack() by execute().
+	const Shift shift // Shift::Up negates taps i+1 and i+3 of each group; otherwise they are copied as given.
+) {
+	const int negate_factor = (shift == Shift::Up) ? -1 : 1; // Multiplier applied to the odd-indexed taps.
+	for(size_t i=0; i<taps.size(); i+=4) { // Taps are handled in groups of four.
+		taps_[i+0] =  taps[i+0]; // Copied unchanged.
+		taps_[i+1] =  taps[i+1] * negate_factor; // Sign follows the shift direction.
+		taps_[i+2] = -taps[i+2]; // Always negated. NOTE(review): presumably the form the mac_fs4_* helpers expect — confirm.
+		taps_[i+3] =  taps[i+3] * negate_factor; // Sign follows the shift direction.
+	}
+	output_scale = scale;
+}
+
+buffer_c16_t FIRC8xR16x24FS4Decim4::execute( // Filters src and writes one packed result per decimation_factor input samples through dst.p.
+	buffer_c8_t src, // Input block; src.p is read in steps of decimation_factor samples.
+	buffer_c16_t dst // Output block; dst.p is written as one uint32_t per output sample.
+) {
+	vec2_s16* const z = static_cast<vec2_s16*>(__builtin_assume_aligned(z_.data(), 4)); // Delay buffer, promised to the compiler as 4-byte aligned.
+	const vec2_s16* const t = static_cast<vec2_s16*>(__builtin_assume_aligned(taps_.data(), 4)); // The int16_t taps, accessed through a vec2_s16 pointer.
+	uint32_t* const d = static_cast<uint32_t*>(__builtin_assume_aligned(dst.p, 4)); // Each output is stored as a single 32-bit word.
+
+	const auto k = output_scale; // Local copy of the scale stored by configure().
+
+	const size_t count = src.count / decimation_factor; // Integer division: a trailing partial group of inputs is not processed.
+	for(size_t i=0; i<count; i++) {
+		const vec4_s8* const in = static_cast<const vec4_s8*>(__builtin_assume_aligned(&src.p[i * decimation_factor], 4)); // Start of the input samples belonging to output i.
+
+		complex32_t accum; // NOTE(review): no explicit initializer — confirm complex32_t starts at zero before the first MAC.
+
+		// Oldest samples are discarded.
+		accum = mac_fs4_shift(z, t, 0, accum);
+		accum = mac_fs4_shift(z, t, 1, accum);
+
+		// Middle samples are shifted earlier in the "z" delay buffer.
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 0, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 1, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 2, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 3, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 4, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 5, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 6, accum);
+		accum = mac_fs4_shift_and_store(z, t, decimation_factor, 7, accum);
+
+		// Newest samples come from "in" buffer, are copied to "z" delay buffer.
+		accum = mac_fs4_shift_and_store_new_c8_samples(z, t, in, decimation_factor, 0, taps_count, accum);
+		accum = mac_fs4_shift_and_store_new_c8_samples(z, t, in, decimation_factor, 1, taps_count, accum);
+
+		d[i] = scale_round_and_pack(accum, k); // Converts the accumulated sum into one packed 32-bit output word.
+	}
+
+	return {
+		dst.p, // Same storage the caller passed in.
+		count, // Number of output samples written.
+		src.sampling_rate / decimation_factor // Input sampling rate divided by the decimation factor.
+	};
+}
+
 // FIRC8xR16x24FS4Decim8 //////////////////////////////////////////////////
 
 FIRC8xR16x24FS4Decim8::FIRC8xR16x24FS4Decim8() {
@@ -242,6 +307,60 @@ buffer_c16_t FIRC8xR16x24FS4Decim8::execute(
 	};
 }
 
+// FIRC16xR16x16Decim2 ////////////////////////////////////////////////////
+
+FIRC16xR16x16Decim2::FIRC16xR16x16Decim2() { // Constructs the decimator with a cleared z_ delay buffer.
+	z_.fill({}); // Every element of z_ becomes a value-initialized vec2_s16.
+}
+
+void FIRC16xR16x16Decim2::configure( // Stores the filter taps and the output scale for later execute() calls.
+	const std::array<tap_t, taps_count>& taps, // Caller's taps; copied unmodified into taps_.
+	const int32_t scale // Saved in output_scale and later passed to scale_round_and_pack() by execute().
+) {
+	std::copy(taps.cbegin(), taps.cend(), taps_.begin()); // Both arrays hold taps_count elements, so the copy fills taps_ exactly.
+	output_scale = scale;
+}
+
+buffer_c16_t FIRC16xR16x16Decim2::execute( // Filters src and writes one packed result per decimation_factor input samples through dst.p.
+	buffer_c16_t src, // Input block; src.p is read in steps of decimation_factor samples.
+	buffer_c16_t dst // Output block; dst.p is written as one uint32_t per output sample.
+) {
+	vec2_s16* const z = static_cast<vec2_s16*>(__builtin_assume_aligned(z_.data(), 4)); // Delay buffer, promised to the compiler as 4-byte aligned.
+	const vec2_s16* const t = static_cast<vec2_s16*>(__builtin_assume_aligned(taps_.data(), 4)); // The int16_t taps, accessed through a vec2_s16 pointer.
+	uint32_t* const d = static_cast<uint32_t*>(__builtin_assume_aligned(dst.p, 4)); // Each output is stored as a single 32-bit word.
+
+	const auto k = output_scale; // Local copy of the scale stored by configure().
+
+	const size_t count = src.count / decimation_factor; // Integer division: a trailing odd input sample is not processed.
+	for(size_t i=0; i<count; i++) {
+		const vec2_s16* const in = static_cast<const vec2_s16*>(__builtin_assume_aligned(&src.p[i * decimation_factor], 4)); // Start of the input samples belonging to output i.
+
+		complex32_t accum; // NOTE(review): no explicit initializer — confirm complex32_t starts at zero before the first MAC.
+
+		// Oldest samples are discarded.
+		accum = mac_shift(z, t, 0, accum);
+
+		// Middle samples are shifted earlier in the "z" delay buffer.
+		accum = mac_shift_and_store(z, t, decimation_factor, 0, accum);
+		accum = mac_shift_and_store(z, t, decimation_factor, 1, accum);
+		accum = mac_shift_and_store(z, t, decimation_factor, 2, accum);
+		accum = mac_shift_and_store(z, t, decimation_factor, 3, accum);
+		accum = mac_shift_and_store(z, t, decimation_factor, 4, accum);
+		accum = mac_shift_and_store(z, t, decimation_factor, 5, accum);
+
+		// Newest samples come from "in" buffer, are copied to "z" delay buffer.
+		accum = mac_shift_and_store_new_c16_samples(z, t, in, decimation_factor, 0, taps_count, accum);
+
+		d[i] = scale_round_and_pack(accum, k); // Converts the accumulated sum into one packed 32-bit output word.
+	}
+
+	return {
+		dst.p, // Same storage the caller passed in.
+		count, // Number of output samples written.
+		src.sampling_rate / decimation_factor // Input sampling rate divided by the decimation factor.
+	};
+}
+
 // FIRC16xR16x32Decim8 ////////////////////////////////////////////////////
 
 FIRC16xR16x32Decim8::FIRC16xR16x32Decim8() {
diff --git a/firmware/baseband/dsp_decimate.hpp b/firmware/baseband/dsp_decimate.hpp
index 5079f259..97f02703 100644
--- a/firmware/baseband/dsp_decimate.hpp
+++ b/firmware/baseband/dsp_decimate.hpp
@@ -92,6 +92,38 @@ private:
 	const std::array<int16_t, taps_count>& taps;
 };
 
+class FIRC8xR16x24FS4Decim4 { // FIR decimator: buffer_c8_t in, buffer_c16_t out, 24 int16_t taps, decimation by 4.
+public:
+	static constexpr size_t taps_count = 24; // Number of taps configure() expects.
+	static constexpr size_t decimation_factor = 4; // Input samples consumed per output sample.
+
+	using sample_t = complex8_t; // NOTE(review): presumably the input sample type — confirm against users of sample_t.
+	using tap_t = int16_t; // Element type of the tap arrays.
+
+	enum class Shift : bool { // Selects the tap sign pattern applied by configure().
+		Down = true, // Odd-indexed taps keep their sign.
+		Up = false // Odd-indexed taps are negated.
+	};
+
+	FIRC8xR16x24FS4Decim4(); // Clears the delay buffer.
+
+	void configure( // Loads the taps (sign-adjusted per shift) and the output scale.
+		const std::array<tap_t, taps_count>& taps,
+		const int32_t scale,
+		const Shift shift = Shift::Down // Shift::Down is used when the argument is omitted.
+	);
+
+	buffer_c16_t execute( // Produces src.count / decimation_factor output samples in dst.
+		buffer_c8_t src,
+		buffer_c16_t dst
+	);
+	
+private:
+	std::array<vec2_s16, taps_count - decimation_factor> z_; // Delay buffer used by execute().
+	std::array<tap_t, taps_count> taps_; // Sign-adjusted copy of the configured taps.
+	int32_t output_scale = 0; // Scale handed to scale_round_and_pack(); zero until configure() is called.
+};
+
 class FIRC8xR16x24FS4Decim8 {
 public:
 	static constexpr size_t taps_count = 24;
@@ -124,6 +156,32 @@ private:
 	int32_t output_scale = 0;
 };
 
+class FIRC16xR16x16Decim2 { // FIR decimator: buffer_c16_t in, buffer_c16_t out, 16 int16_t taps, decimation by 2.
+public:
+	static constexpr size_t taps_count = 16; // Number of taps configure() expects.
+	static constexpr size_t decimation_factor = 2; // Input samples consumed per output sample.
+
+	using sample_t = complex16_t; // NOTE(review): presumably the input sample type — confirm against users of sample_t.
+	using tap_t = int16_t; // Element type of the tap arrays.
+
+	FIRC16xR16x16Decim2(); // Clears the delay buffer.
+
+	void configure( // Loads the taps unmodified together with the output scale.
+		const std::array<tap_t, taps_count>& taps,
+		const int32_t scale
+	);
+
+	buffer_c16_t execute( // Produces src.count / decimation_factor output samples in dst.
+		buffer_c16_t src,
+		buffer_c16_t dst
+	);
+	
+private:
+	std::array<vec2_s16, taps_count - decimation_factor> z_; // Delay buffer used by execute().
+	std::array<tap_t, taps_count> taps_; // Copy of the configured taps.
+	int32_t output_scale = 0; // Scale handed to scale_round_and_pack(); zero until configure() is called.
+};
+
 class FIRC16xR16x32Decim8 {
 public:
 	static constexpr size_t taps_count = 32;