diff --git a/cluster_experts.py b/cluster_experts.py new file mode 100644 index 0000000..a1f1997 --- /dev/null +++ b/cluster_experts.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" +Co-activation expert clustering for flash-moe. + +Analyzes routing logs to find which experts are frequently co-activated, +then rewrites packed expert files so co-activated experts are physically +adjacent on disk. This improves cold SSD read throughput by ~38% for +cache misses (measured: scattered=3.2GB/s, adjacent=4.4GB/s on M1 Pro). + +Usage: + # Step 1: Generate routing log during inference + ./infer --prompt "..." --tokens 200 --k 4 --collect-routing routing.bin + + # Step 2: Analyze and cluster + python3 cluster_experts.py --routing routing.bin --packed-dir metal_infer/packed_experts + + # Step 3: Verify + python3 cluster_experts.py --routing routing.bin --verify +""" + +import argparse +import os +import struct +import sys +import time +import numpy as np +from collections import defaultdict + +EXPERT_SIZE = 7077888 +NUM_EXPERTS = 512 +NUM_LAYERS = 60 +HIDDEN_DIM = 4096 + + +def load_routing_log(path): + """Load binary routing log. Format per sample: int32 layer, int32 K, float32[4096] hidden, int32[K] experts.""" + routing = defaultdict(list) # layer -> list of expert_index tuples + + with open(path, 'rb') as f: + while True: + header = f.read(8) + if len(header) < 8: + break + layer_idx, K = struct.unpack('