chore: remove failed runs from results

ruyadorno · ruyadorno · commit 754bfcb4fedf · 2025-08-06T16:58:19.000-04:00
Adds a new step to the `process` job that cleans up the benchmark results of every fixture + variation combination, across all package managers. This way we can be sure that failed runs do not impact result numbers, while also making the system more resilient: a full benchmark can still complete even if a few runs for a specific package manager + fixture + variation fail to complete. Fixes: #11
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
@@ -5,7 +5,7 @@ on:
     inputs:
       fixtures:
         description: 'The fixture to run the benchmarks on'
-        default: '["next", "astro", "svelte", "vue"]'
+        default: '["next", "astro", "vue", "svelte"]'
       variations:
         description: 'The benchmark variations to run'
         default: '["cache", "cache+lockfile", "cache+node_modules", "cache+lockfile+node_modules", "clean", "lockfile", "lockfile+node_modules", "node_modules"]'
@@ -80,6 +80,116 @@ jobs:
         with:
           path: results
           pattern: results-*
+      - name: Clean benchmarks result
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Define fixture and variation values from the strategy matrix
+            const fixtures = [
+              "next",
+              "astro",
+              "vue",
+              "svelte",
+              "run"
+            ];
+            const variations = [
+              "cache",
+              "cache+lockfile",
+              "cache+node_modules",
+              "cache+lockfile+node_modules",
+              "clean",
+              "lockfile",
+              "lockfile+node_modules",
+              "node_modules",
+              "run"
+            ];
+
+            // Helper functions for statistical calculations
+            function calculateMean(times) {
+              return times.reduce((sum, time) => sum + time, 0) / times.length;
+            }
+
+            function calculateStddev(times, mean) {
+              const variance = times.reduce((sum, time) => sum + Math.pow(time - mean, 2), 0) / times.length;
+              return Math.sqrt(variance);
+            }
+
+            function calculateMedian(times) {
+              const sorted = [...times].sort((a, b) => a - b);
+              const mid = Math.floor(sorted.length / 2);
+              return sorted.length % 2 === 0
+                ? (sorted[mid - 1] + sorted[mid]) / 2
+                : sorted[mid];
+            }
+
+            // Clean benchmark results
+            for (const fixture of fixtures) {
+              for (const variation of variations) {
+                // we only handle one specific combination for run in which both its
+                // fixture and variation are named run. if it's anything else, we skip it.
+                const skipInvalidRunFixtures = variation === "run" && fixture !== "run";
+                const skipInvalidRunVariations = fixture === "run" && variation !== "run";
+                if (skipInvalidRunFixtures || skipInvalidRunVariations) {
+                  continue;
+                }
+
+                const benchmarkPath = path.join('results', `results-${fixture}-${variation}`, 'benchmarks.json');
+
+                try {
+                  console.log(`Cleaning benchmark file: ${benchmarkPath}`);
+                  const benchmarkData = JSON.parse(fs.readFileSync(benchmarkPath, 'utf8'));
+
+                  if (benchmarkData.results && benchmarkData.results.length > 0) {
+                    for (let i = 0; i < benchmarkData.results.length; i++) {
+                      const result = benchmarkData.results[i];
+                      const { times, exit_codes } = result;
+
+                      if (times && exit_codes && times.length === exit_codes.length) {
+                        // Filter out times where exit_codes is not 0
+                        const cleanTimes = times.filter((time, index) => exit_codes[index] === 0);
+                        const cleanExitCodes = exit_codes.filter(code => code === 0);
+
+                        if (cleanTimes.length > 0) {
+                          // Recalculate statistics
+                          const mean = calculateMean(cleanTimes);
+                          const stddev = calculateStddev(cleanTimes, mean);
+                          const median = calculateMedian(cleanTimes);
+                          const min = Math.min(...cleanTimes);
+                          const max = Math.max(...cleanTimes);
+
+                          // Update the result object
+                          result.times = cleanTimes;
+                          result.exit_codes = cleanExitCodes;
+                          result.mean = mean;
+                          result.stddev = stddev;
+                          result.median = median;
+                          result.min = min;
+                          result.max = max;
+
+                          console.log(`Cleaned ${fixture}-${variation} (result ${i}): ${times.length - cleanTimes.length} failed runs removed, ${cleanTimes.length} valid runs remaining`);
+                        } else {
+                          console.warn(`All runs failed for ${fixture}-${variation} (result ${i})`);
+                        }
+                      } else {
+                        console.warn(`Invalid times/exit_codes arrays for ${fixture}-${variation} (result ${i})`);
+                      }
+                    }
+
+                    // Save the cleaned data back to the file
+                    fs.writeFileSync(benchmarkPath, JSON.stringify(benchmarkData, null, 2));
+                  } else {
+                    console.warn(`No results found in ${benchmarkPath}`);
+                  }
+                } catch (error) {
+                  console.error(`Failed to clean ${benchmarkPath}: ${error.message}`);
+                }
+              }
+            }
+
+            console.log('Benchmark cleaning completed');
       - name: Process Results
         run: |
           bash ./scripts/process-results.sh