Increase MIDI speed by 20% by saturating at end (earlephilhower#792)

earlephilhower · web-flow · commit 41b96ef000d0 · 2025-10-22T18:32:35.000-07:00
By accumulating into a 32-bit int we don't have to check for saturation in
the inner loop of the voice render, which is called multiple times per actual
output block.  That check is very expensive.

In tsf_render_short (now tsf_render_short_2x), the input `short *buffer`
must have 2x the normal space so that a full 32-bit quantity can accumulate
for each sample.  After all voices have been rendered, the 32-bit sums are
converted in place, with saturation, into int16_t values.

About 24% faster on the Pico while still saturating properly.
diff --git a/lib/TinySoundFont b/lib/TinySoundFont
@@ -1 +1 @@
-Subproject commit 864d8888f00ef2c689ca26f382808d58993b9193
+Subproject commit e2b7b62c8ba1b368aa6f5c64bc8a7b4a2286604a
diff --git a/src/AudioGeneratorMIDI.cpp b/src/AudioGeneratorMIDI.cpp
@@ -559,11 +559,11 @@ bool AudioGeneratorMIDI::loop() {
                 break;
             }
         } else if (samplesToPlay) {
-            numSamplesRendered = (sizeof(samplesRendered) / sizeof(samplesRendered[0])) / 2;
-            if ((int)samplesToPlay < (int)(sizeof(samplesRendered) / sizeof(samplesRendered[0])) / 2) {
+            numSamplesRendered = (sizeof(samplesRendered) / sizeof(samplesRendered[0])) / 4;
+            if ((int)samplesToPlay < (int)(sizeof(samplesRendered) / sizeof(samplesRendered[0])) / 4) {
                 numSamplesRendered = samplesToPlay;
             }
-            tsf_render_short(g_tsf, samplesRendered, numSamplesRendered, 0);
+            tsf_render_short_2x(g_tsf, samplesRendered, numSamplesRendered, 0);
             samplesToPlay -= numSamplesRendered;
             sentSamplesRendered = 0;
         } else {
diff --git a/src/AudioGeneratorMIDI.h b/src/AudioGeneratorMIDI.h
@@ -186,7 +186,7 @@ class AudioGeneratorMIDI : public AudioGenerator {
     bool sawEOF;
     int numSamplesRendered;
     int sentSamplesRendered ;
-    short samplesRendered[256 * 2];
+    short samplesRendered[256 * 2 * 2];
 
     tsf *_tsf = nullptr;
 };
diff --git a/src/libtinysoundfont/tsf.h b/src/libtinysoundfont/tsf.h
@@ -210,7 +210,12 @@ TSFDEF int tsf_active_voice_count(tsf* f);
 //   buffer: target buffer of size samples * output_channels * sizeof(type)
 //   samples: number of samples to render
 //   flag_mixing: if 0 clear the buffer first, otherwise mix into existing data
+#ifndef TSF_SAMPLES_SHORT
 TSFDEF void tsf_render_short(tsf* f, short* buffer, int samples, int flag_mixing CPP_DEFAULT0);
+#else
+// The short render accumulates each sample in a 32-bit int, so pass in a buffer of 2x the expected size.  Only the first `samples` samples will be valid on return.
+TSFDEF void tsf_render_short_2x_buffer(tsf* f, short* buffer_2x, int samples, int flag_mixing CPP_DEFAULT0);
+#endif
 TSFDEF void tsf_render_float(tsf* f, float* buffer, int samples, int flag_mixing CPP_DEFAULT0);
 
 // Higher level channel based functions, set up channel parameters
@@ -1536,7 +1541,7 @@ static void tsf_voice_render(tsf* f, struct tsf_voice* v, float* outputBuffer, i
 	if (tmpLowpass.active || dynamicLowpass) v->lowpass = tmpLowpass;
 }
 #else
-static void tsf_voice_render_short(tsf* f, struct tsf_voice* v, short* outputBuffer, int numSamples)
+static void tsf_voice_render_short(tsf* f, struct tsf_voice* v, int32_t* outputBuffer, int numSamples)
 {
 #ifdef ESP8266
     static unsigned int smps = 0;
@@ -1547,7 +1552,7 @@ static void tsf_voice_render_short(tsf* f, struct tsf_voice* v, short* outputBuf
 #endif
 	TSF_CONST struct tsf_region* region = v->region;
         TSF_CONST short* input = f->shortSamples;
-	short* outL = outputBuffer;
+	int32_t* outL = outputBuffer;
 	//short* outR = (f->outputmode == TSF_STEREO_UNWEAVED ? outL + numSamples : TSF_NULL);
 
 	// Cache some values, to give them at least some chance of ending up in registers.
@@ -1643,19 +1648,20 @@ static void tsf_voice_render_short(tsf* f, struct tsf_voice* v, short* outputBuf
                                         // Do saturating adds for each channel
                                         fixed16p16 smp;
 
+                                        // No per-sample clipping here: it eats 20% of the performance on the M0+ Pico.  Saturation is applied once after all voices are summed.
+
                                         smp = *outL;
                                         smp += (val * gainLeftF16P16) >> 16;
-                                        if (smp > 32767) smp = 32767;
-                                        else if (smp < -32768) smp = -32768;
+//                                        if (smp > 32767) smp = 32767;
+//                                        else if (smp < -32768) smp = -32768;
                                         *outL++ = smp;
 
                                         smp = *outL;
                                         smp += (val * gainRightF16P16) >> 16;
-                                        if (smp > 32767) smp = 32767;
-                                        else if (smp < -32768) smp = -32768;
+//                                        if (smp > 32767) smp = 32767;
+//                                        else if (smp < -32768) smp = -32768;
                                         *outL++ = smp;
 
-
 //					*outL++ += val * gainLeft;
 //					*outL++ += val * gainRight;
 
@@ -2161,13 +2167,20 @@ TSFDEF void tsf_render_float(tsf* f, float* buffer, int samples, int flag_mixing
 			tsf_voice_render(f, v, buffer, samples);
 }
 #else
-TSFDEF void tsf_render_short(tsf* f, short* buffer, int samples, int flag_mixing)
+TSFDEF void tsf_render_short_2x(tsf* f, short* buffer, int samples, int flag_mixing)
 {
         struct tsf_voice *v = f->voices, *vEnd = v + f->voiceNum;
-        if (!flag_mixing) TSF_MEMSET(buffer, 0, (f->outputmode == TSF_MONO ? 1 : 2) * sizeof(short) * samples);
+        int32_t *buffer32 = (int32_t *)buffer;
+        if (!flag_mixing) TSF_MEMSET(buffer, 0, (f->outputmode == TSF_MONO ? 1 : 2) * sizeof(short) * samples * 2 /* We sum in 32bs then downsample here to 16b to minimized saturation calcs */);
         for (; v != vEnd; v++)
                 if (v->playingPreset != -1)
-                        tsf_voice_render_short(f, v, buffer, samples);
+                        tsf_voice_render_short(f, v, buffer32, samples);
+        for (int i = 0; i < samples * 2; i++) {
+            int32_t t = buffer32[i];
+            if (t > 32767) buffer[i] = 32767;
+            else if (t < -32768) buffer[i] = -32768;
+            else buffer[i] = t;
+        }
 }
 #endif