38 changes: 36 additions & 2 deletions .github/workflows/coverage.yml
@@ -9,7 +9,7 @@ on:


jobs:
test_coverage:
test_vllm_coverage:
runs-on: ParallelHoss

steps:
@@ -25,13 +25,47 @@ jobs:
- name: Install dependencies
run: |
python -m pip install -U pip
pip install -e .[vllm]
pip install -e .[lora]
pip install --force-reinstall 'triton==3.2.0'
pip install -r requirements-dev.txt

- name: Run tests
run: |
coverage run --source=genlm/backend -m pytest --benchmark-disable --ignore=tests/test_mlx.py
coverage run --source=genlm/backend -m pytest --benchmark-disable --ignore=tests/test_mlx.py --ignore=tests/test_sgl.py
coverage json --omit "*/test*"
coverage report --omit "*/test*"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
fail_ci_if_error: false
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.json
slug: genlm/genlm-backend

test_sgl_coverage:
runs-on: ParallelHoss

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- uses: actions/setup-python@v4
with:
python-version: 3.11.5
cache: 'pip'

- name: Install dependencies
run: |
python -m pip install -U pip
pip install -e .[sgl]
pip install -r requirements-dev.txt

- name: Run tests
run: |
coverage run --source=genlm/backend -m pytest tests/test_sgl.py
coverage json --omit "*/test*"
coverage report --omit "*/test*"

10 changes: 6 additions & 4 deletions README.md
@@ -10,7 +10,7 @@

</div>

GenLM Backend is a high-performance backend for language model probabilistic programs, built for the GenLM ecosystem. It provides an **asynchronous** and **autobatched** interface to `vllm` and `transformers` language models, enabling scalable and efficient inference.
GenLM Backend is a high-performance backend for language model probabilistic programs, built for the GenLM ecosystem. It provides an **asynchronous** and **autobatched** interface to language model inference via `vllm`, `sglang`, `transformers`, and `mlx-lm`.

See our [documentation](https://genlm.github.io/genlm-backend/).
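The "asynchronous and autobatched" interface can be illustrated with a self-contained sketch (a toy illustration of the autobatching pattern only, not the library's actual API): concurrent single-item requests are queued during one event-loop tick, then served together by a single batched call.

```python
import asyncio

class AutoBatcher:
    """Toy autobatcher: concurrent single-item requests are queued,
    then served together by one batched call."""

    def __init__(self, batch_fn):
        self.batch_fn = batch_fn  # evaluates a whole list of inputs at once
        self.pending = []         # (input, Future) pairs awaiting a batch
        self.flush_scheduled = False

    async def request(self, x):
        loop = asyncio.get_running_loop()
        fut = loop.create_future()
        self.pending.append((x, fut))
        if not self.flush_scheduled:
            self.flush_scheduled = True
            # Flush after the current event-loop tick, so all concurrent
            # callers get a chance to join the batch first.
            loop.call_soon(self._flush)
        return await fut

    def _flush(self):
        batch, self.pending = self.pending, []
        self.flush_scheduled = False
        for (_, fut), result in zip(batch, self.batch_fn([x for x, _ in batch])):
            fut.set_result(result)

calls = []  # records the size of each batched call

def fake_model(xs):
    # Stand-in for one batched model forward pass.
    calls.append(len(xs))
    return [x * 2 for x in xs]

async def main():
    batcher = AutoBatcher(fake_model)
    # Four concurrent requests; the batcher serves them in one call.
    return await asyncio.gather(*(batcher.request(i) for i in range(4)))

out = asyncio.run(main())
print(out, calls)
```

In the real backends, `batch_fn` corresponds to a batched forward pass on the accelerator; the payoff is that naively written per-request async code still gets batched throughput.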

@@ -23,19 +23,21 @@ See our [documentation](https://genlm.github.io/genlm-backend/).

## ⚡ Quick Start

This library supports installation via pip:
This library supports installation via pip. This uses `transformers` as the default inference backend.

```bash
pip install genlm-backend
```

Or to install with MLX support, run:
To use a different backend, install the corresponding extra:

```bash
pip install genlm-backend[mlx]
pip install genlm-backend[vllm]
pip install genlm-backend[sgl]
```

Or to install with LoRA support, run:
For LoRA support:

```bash
pip install genlm-backend[lora]
22 changes: 11 additions & 11 deletions benchmark/benchmark_backend.py
@@ -14,29 +14,29 @@
run_await_batch_next_token_logprobs,
)

from genlm.backend.llm import AsyncVirtualLM, AsyncTransformer
from genlm.backend.llm import load_model_by_name

text = get_wikitext()


def load_model(model, batch_size=None):
def load_model(backend, batch_size=None):
model_name = "gpt2"
if model == "vllm":
return AsyncVirtualLM.from_name(model_name)
if backend in ["vllm", "sgl"]:
return load_model_by_name(model_name, backend=backend)
else:
return AsyncTransformer.from_name(model_name, batch_size=batch_size)
return load_model_by_name(model_name, backend=backend, batch_size=batch_size)


@pytest.mark.parametrize("model", ["vllm", "transformer"])
def test_await_next_token_logprobs(benchmark, model):
llm = load_model(model, batch_size=1)
@pytest.mark.parametrize("backend", ["vllm", "sgl"])
def test_await_next_token_logprobs(benchmark, backend):
llm = load_model(backend, batch_size=1)
sequences = token_prefixes(text, tokenizer=llm.tokenizer)
run_await_next_token_logprobs(benchmark=benchmark, llm=llm, sequences=sequences)


@pytest.mark.parametrize("model", ["vllm", "transformer"])
def test_await_batch_next_token_logprobs(benchmark, model, batch_size=20):
llm = load_model(model, batch_size=batch_size)
@pytest.mark.parametrize("backend", ["vllm", "sgl"])
def test_await_batch_next_token_logprobs(benchmark, backend, batch_size=20):
llm = load_model(backend, batch_size=batch_size)
batches = token_prefix_batches(text, tokenizer=llm.tokenizer, batch_size=batch_size)
run_await_batch_next_token_logprobs(
benchmark=benchmark, llm=llm, batches=batches, rounds=50, warmup_rounds=10
6 changes: 5 additions & 1 deletion genlm/backend/llm/__init__.py
@@ -2,6 +2,7 @@
from genlm.backend.llm.hf import AsyncTransformer
from genlm.backend.llm.base import AsyncLM, MockAsyncLM
from genlm.backend.llm.mlx import AsyncMlxLM
from genlm.backend.llm.sgl import AsyncSGLTransformer

import torch

@@ -11,7 +12,7 @@ def load_model_by_name(name, backend=None, llm_opts=None):

Args:
name (str): Hugging Face model name (e.g. "gpt2", "meta-llama/Llama-3.2-1B-Instruct")
backend (str, optional): Backend to use for inference. Can be "vllm", "hf" or "mock".
backend (str, optional): Backend to use for inference. Can be "vllm", "hf", "mlx", "sgl", or "mock".
If None, defaults to "vllm" if CUDA is available, otherwise "hf".
llm_opts (dict, optional): Additional options to pass to the backend constructor.
See AsyncVirtualLM and AsyncTransformer documentation for details.
@@ -36,6 +37,8 @@ def load_model_by_name(name, backend=None, llm_opts=None):
return MockAsyncLM.from_name(name, **llm_opts)
elif backend == "mlx":
return AsyncMlxLM.from_name(name, **llm_opts)
elif backend == "sgl":
return AsyncSGLTransformer.from_name(name, **llm_opts)
else:
raise ValueError(f"Invalid backend: {backend}")

@@ -46,5 +49,6 @@ def load_model_by_name(name, backend=None, llm_opts=None):
"AsyncVirtualLM",
"AsyncTransformer",
"AsyncMlxLM",
"AsyncSGLTransformer",
"MockAsyncLM",
]
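The `if`/`elif` dispatch in `load_model_by_name` amounts to a mapping from backend name to constructor. A minimal sketch of the same pattern (with hypothetical stand-in constructors, not the real classes) is:

```python
# Hypothetical stand-ins for the real backend classes, used only to
# illustrate the dispatch pattern in load_model_by_name.
BACKENDS = {
    "vllm": lambda name, **opts: ("AsyncVirtualLM", name),
    "hf":   lambda name, **opts: ("AsyncTransformer", name),
    "mlx":  lambda name, **opts: ("AsyncMlxLM", name),
    "sgl":  lambda name, **opts: ("AsyncSGLTransformer", name),
    "mock": lambda name, **opts: ("MockAsyncLM", name),
}

def load(name, backend, **opts):
    try:
        return BACKENDS[backend](name, **opts)
    except KeyError:
        # Mirror the ValueError raised for unknown backend names.
        raise ValueError(f"Invalid backend: {backend}") from None

print(load("gpt2", backend="sgl"))
```

A registry dict like this keeps the set of valid backend names in one place, so adding a backend does not grow an `if`/`elif` chain.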