-
Notifications
You must be signed in to change notification settings - Fork 0
/
searchindex.js
1 lines (1 loc) · 57.7 KB
/
searchindex.js
1
Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[16, "achieving-peak-throughput"]], "Add Unit Tests": [[13, "add-unit-tests"]], "Add a Runner": [[8, "add-a-runner"]], "Add the model to the test suite": [[20, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[16, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[11, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Batches": [[4, "Batches"]], "Batching": [[10, "batching"]], "Benchmark": [[12, "benchmark"]], "Benchmark and Profiling": [[12, null]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[21, "cuda-error-an-illegal-memory-access-was-encountered"]], "CUDA out of memory": [[21, "cuda-out-of-memory"]], "Chat Completions": [[4, "Chat-Completions"]], "Chat Template": [[6, "Chat-Template"]], "Choices Methods in SGLang": [[9, null]], "Classify (reward model)": [[2, "Classify-(reward-model)"]], "Clean": [[0, "clean"]], "Common Notes": [[22, "common-notes"]], "Completions": [[4, "Completions"]], "Constrained Decoding": [[10, "constrained-decoding"]], "Contributor Guide": [[13, null]], "Control Flow": [[10, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[14, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Models": [[20, "embedding-models"]], "Encode (embedding model)": [[2, "Encode-(embedding-model)"]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Example: Run Llama 3.1 405B": [[1, "example-run-llama-3-1-405b"]], "Examples": [[19, "examples"]], "Flush Cache": [[2, "Flush-Cache"]], "Format Your Code": [[13, "format-your-code"]], "Frequently Asked Questions": [[15, null]], "Frontend Tutorial": [[11, null]], "Frontend: Structured Generation Language (SGLang)": [[10, null]], "Generate (text generation model)": [[2, "Generate-(text-generation-model)"]], "Generative Models": [[20, "generative-models"]], "Get Memory Pool Size": [[2, "Get-Memory-Pool-Size"]], "Get Model Info": [[2, "Get-Model-Info"]], "Get Server Args": [[2, "Get-Server-Args"]], "Getting Started": [[11, null]], "Grafana Dashboard": [[18, "grafana-dashboard"]], "Greedy Token Selection": [[9, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[16, null]], "Health Check": [[2, "Health-Check"]], "How to Support a New Model": [[20, "how-to-support-a-new-model"]], "Install SGLang": [[22, null]], "Interactive debugging": [[20, "interactive-debugging"]], "JSON": [[4, "JSON"]], "JSON Decoding": [[10, "json-decoding"]], "JSON Format": [[14, "json-format"]], "Jinja Format": [[14, "jinja-format"]], "Language Feature": [[10, "language-feature"]], "Launch A Server": [[2, "Launch-A-Server"], [4, "Launch-A-Server"], [5, "Launch-A-Server"], [6, "Launch-A-Server"], [23, "Launch-A-Server"]], "Learn more": [[17, null]], "Make a release in GitHub": [[7, "make-a-release-in-github"]], "Method 1: With pip": [[22, "method-1-with-pip"]], "Method 2: From source": [[22, "method-2-from-source"]], "Method 3: Using docker": [[22, "method-3-using-docker"]], "Method 4: Using docker compose": [[22, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[22, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[9, "methods"]], "More Examples": [[10, "more-examples"]], "Multi modal": [[19, "multi-modal"]], "Multi-Modality": [[10, "multi-modality"]], "Multiple-Image Inputs": [[6, "Multiple-Image-Inputs"]], "Native APIs": [[2, null]], "Non-streaming Asynchronous Generation": [[3, "Non-streaming-Asynchronous-Generation"]], "Non-streaming Synchronous Generation": [[3, "Non-streaming-Synchronous-Generation"]], "Normal": [[19, "normal"]], "Offline Batch Inference": [[3, "Offline-Batch-Inference"]], "Offline Engine API": [[3, null]], "OpenAI APIs - Completions": [[4, null]], "OpenAI APIs - Embedding": [[5, null]], "OpenAI APIs - Vision": [[6, null]], "OpenAI Compatible API": [[1, "openai-compatible-api"]], "Other tips": [[12, "other-tips"]], "Parallelism": [[10, "parallelism"]], "Parameters": [[4, "Parameters"], [4, "id2"]], "Port a model from vLLM to SGLang": [[20, "port-a-model-from-vllm-to-sglang"]], "Production Metrics": [[18, null]], "Profile with Nsight": [[12, "profile-with-nsight"]], "PyPI Package Release Process": [[7, null]], "Quick Start": [[1, "quick-start"], [10, "quick-start"]], "Quick Start: Sending Requests": [[23, null]], "References": [[11, null]], "Regular expression": [[4, "Regular-expression"]], "Reward Models": [[20, "reward-models"]], "Roles": [[10, "roles"]], "SGLang Documentation": [[0, null], [11, null]], "Sampling Parameters in SGLang Runtime": [[19, null]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-Hosted Runners for GitHub Action": [[8, null]], "Setup Guide": [[18, "setup-guide"]], "Step 1: Start a docker container.": [[8, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[8, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[8, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[10, "streaming"], [19, "streaming"], [23, "Streaming"], [23, "id1"]], "Streaming Asynchronous Generation": [[3, "Streaming-Asynchronous-Generation"]], "Streaming Synchronous Generation": [[3, "Streaming-Synchronous-Generation"]], "Structured decoding (JSON, Regex)": [[4, "Structured-decoding-(JSON,-Regex)"], [19, "structured-decoding-json-regex"]], "Supported Models": [[20, null]], "Test the correctness": [[20, "test-the-correctness"]], "The results are not deterministic, even with a temperature of 0": [[15, "the-results-are-not-deterministic-even-with-a-temperature-of-0"]], "Tips and Implementation Details": [[10, "tips-and-implementation-details"]], "Token Length Normalized": [[9, "token-length-normalized"]], "Troubleshooting": [[21, null]], "Try Advanced Options": [[16, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[16, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[16, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[16, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[16, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[9, "unconditional-likelihood-normalized"]], "Update Weights": [[2, "Update-Weights"]], "Update the version in code": [[7, "update-the-version-in-code"]], "Upload the PyPI package": [[7, "upload-the-pypi-package"]], "Usage": [[4, "Usage"], [4, "id1"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[5, "Using-Input-IDs"]], "Using Local Models": [[10, "using-local-models"]], "Using Native Generation APIs": [[23, "Using-Native-Generation-APIs"]], "Using OpenAI Models": [[10, "using-openai-models"]], "Using OpenAI Python Client": [[5, "Using-OpenAI-Python-Client"], [6, "Using-OpenAI-Python-Client"], [23, "Using-OpenAI-Python-Client"]], "Using Python Requests": [[5, "Using-Python-Requests"], [6, "Using-Python-Requests"], [23, "Using-Python-Requests"]], "Using cURL": [[5, "Using-cURL"], [6, "Using-cURL"], [23, "Using-cURL"]]}, "docnames": ["README", "backend/backend", "backend/native_api", "backend/offline_engine_api", "backend/openai_api_completions", "backend/openai_api_embeddings", "backend/openai_api_vision", "developer/release_process", "developer/setup_github_runner", "frontend/choices_methods", "frontend/frontend", "index", "references/benchmark_and_profiling", "references/contributor_guide", "references/custom_chat_template", "references/faq", "references/hyperparameter_tuning", "references/learn_more", "references/production_metrics", "references/sampling_params", "references/supported_models", "references/troubleshooting", "start/install", "start/send_request"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend/backend.md", "backend/native_api.ipynb", "backend/offline_engine_api.ipynb", "backend/openai_api_completions.ipynb", "backend/openai_api_embeddings.ipynb", "backend/openai_api_vision.ipynb", "developer/release_process.md", "developer/setup_github_runner.md", "frontend/choices_methods.md", "frontend/frontend.md", "index.rst", "references/benchmark_and_profiling.md", "references/contributor_guide.md", "references/custom_chat_template.md", "references/faq.md", "references/hyperparameter_tuning.md", "references/learn_more.md", "references/production_metrics.md", "references/sampling_params.md", "references/supported_models.md", "references/troubleshooting.md", "start/install.md", "start/send_request.ipynb"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [2, 3, 4, 5, 6, 9, 10, 16, 18, 19, 20, 22, 23], "0": [1, 2, 3, 4, 5, 6, 8, 10, 16, 18, 19, 22, 23], "00": [2, 3, 4, 5, 6, 23], "000": 4, "0000": 16, "0006747245788574219": 5, "0006804466247558594": [2, 5], "000682830810546875": [2, 5], "001": 18, "0020961761474609375": 5, "0020999908447265625": [2, 5], "003025054931640625": 5, "0030345916748046875": [2, 5], "005": 18, "006198883056640625": 5, "006214141845703125": [2, 5], "007263183594": 18, "00807952880859375": [2, 5], "00830078125": 5, "00830841064453125": [2, 5], "009002685546875": [2, 5], "00it": 4, "01": [2, 3, 4, 5, 6, 10, 16, 18, 23], "01239013671875": [2, 5], "01438140869140625": [2, 5], "015": 18, "01it": 23, "02": [2, 3, 4, 6, 18, 23], "025": 18, "03": [2, 4, 5, 6, 18, 23], "04": [2, 4, 6, 8, 18, 23], "05": [2, 4, 5, 18], "05it": 6, "06": [2, 4, 5, 18], "06it": 2, "07": [2, 4, 5, 23], "075": 18, "0780844688416": 18, "08": [2, 4, 5, 18, 23], "08it": 2, "09": [2, 4, 23], "09212830428b": 4, "09375": 2, "09d8d0c6125649f9b1bdd044fbae9f87": 6, "0_rocm6": 8, "0_triton3": 8, "1": [2, 3, 4, 5, 6, 10, 12, 16, 18, 19, 20, 23], "10": [2, 4, 5, 6, 12, 18], "100": [2, 3, 4, 5, 6, 10, 18, 23], "1000": 18, "10000": 18, "100000": 18, "10189": 4, "1024": 22, "1025173": [2, 5], "1034164859": 23, "10407": 4, "106": [6, 23], "10626": 4, "107": 4, "10835": 4, "10it": 6, "11": [2, 3, 4, 5, 6, 23], "110": 4, "11118": 4, "112025": 4, "1182": 18, "1187": 18, "119": 4, "11b": 6, "11it": 4, "12": [2, 4, 6, 8, 22], "123": 4, "12325": 4, "1266": 18, "127": [1, 2, 4, 5, 6, 23], "1270": 18, "128": [1, 4, 19, 22], "128008": 2, "128009": [4, 6, 23], "12895": 6, "128g": 8, "129": 4, "12917": 6, "12it": 23, "13": [2, 4, 6, 23], "130": 4, "131072": [2, 4, 5, 6, 23], "132": 3, "132025": 4, "133": [4, 23], "1335": 15, "134": 4, "1350": 18, "139394": 18, "14": [2, 4, 5, 6], "145": 4, "147": 4, "14it": 3, "15": [2, 3, 4, 18, 23], "150": 4, "152025": 4, "1563": 18, "15it": 6, "16": [1, 2, 3, 4, 10], "160": [2, 4, 5, 6, 23], "1600": 3, "161721": 18, "1623": 18, "16325": 4, "16384": [2, 4, 5, 6, 18, 23], "16g": 22, "17": [4, 5], "172": 1, "172025": 4, "1729": 15, "1731366990": 4, "1731366992": 4, "1731366993": 4, "1731366996": 4, "1731367151": 6, "1731367152": 6, "1731367207": 23, "1731367208": 23, "1748": 18, "175": 4, "1763311": 2, "1764147": 2, "1764978": 2, "1766732": 4, "1767653": 5, "1768567": 6, "1769938": 23, "18": [2, 4, 5, 6], "18169": 4, "187": 4, "1883": 18, "18850": 4, "19": [2, 4, 6, 23], "191": 18, "192025": 4, "19649": 4, "19db66d1ee9f": 4, "19th": 3, "1a904a360f0a4622a0a1cb9b45cc66fd": 23, "1b": 2, "1b043660b4f0421ba52240255876f5df": 6, "1d3e67e0": 4, "2": [1, 2, 3, 4, 5, 6, 10, 14, 18, 19, 20, 23], "20": [3, 4, 6, 18, 23], "200": [2, 4, 5, 6, 18, 23], "2000": 18, "20000": [1, 18], "202": 18, "2024": [2, 4, 5, 6, 23], "20325": 4, "20407": 4, "2048": [2, 12, 16, 21], "2049": [2, 4, 6, 23], "207": 18, "2095": 18, "20afdd603cd34b86a275d1cb771dfa75": 23, "21": [2, 4, 5, 18, 23], "210": 4, "2102": 18, "2104": 18, "2124": 18, "21245": 4, "2147000": 4, "2170757": 2, "21a4": 4, "21it": 4, "22": [2, 3, 18], "22156": 4, "22it": 23, "23": [2, 3, 4, 5, 6, 23], "23182": 4, "2325": 4, "233": 16, "23it": [2, 6], "24": [2, 4], "24247": 4, "243": [2, 3, 4, 5, 6, 23], "24325": 4, "2448eb29": 4, "24885": 18, "24h": 4, "25": [2, 3, 4, 6, 10, 18, 23], "250": 4, "256": [2, 4, 5, 6, 10, 12, 23], "25it": [3, 6], "26": 4, "27": 4, "2790": 4, "27b": 20, "28": 4, "28325": 4, "288": 2, "29": [2, 5, 6], "298440": 6, "29846": 18, "3": [2, 3, 4, 5, 6, 10, 12, 14, 16, 18, 19, 20, 23], "30": [3, 4, 18, 23], "300": [2, 4, 5, 6, 23], "3000": 18, "30000": [1, 4, 5, 6, 10, 14, 18, 19, 20, 22, 23], "30010": 2, "30020": 2, "30030": 2, "30060": 18, "3072": 2, "31": [4, 5, 6, 23], "310": 4, "311": 18, "317": 16, "32": [1, 2, 4, 5, 6, 12, 19, 22, 23], "32025": 4, "32325": 4, "3237c596": 4, "3249": 4, "32g": 22, "33": [4, 5, 23], "3348": 4, "33982": 23, "33994": 23, "34": [2, 4, 5, 6, 23], "34002": 23, "34012": 23, "34022": 23, "34026": 23, "34034": 23, "34it": [2, 6], "35": [2, 3, 4, 6], "35002": 5, "35796": 2, "35798": 2, "35808": 2, "35810": 2, "35822": 2, "35830": 2, "35834": 2, "35848": 2, "35860": 2, "35it": 4, "36": [2, 4, 6], "36325": 4, "36851967": 2, "37": [2, 4, 23], "370959": 16, "371": 4, "373129261": 5, "375": 2, "37682": 4, "37778": 6, "37782": 6, "37798": 6, "37it": [2, 23], "38": 4, "38004": 4, "38170": 2, "3846": 4, "38it": 3, "39": [2, 3, 4, 5, 6, 23], "39152": 5, "39154": 5, "39168": 5, "39184": [5, 18], "39200": 5, "39212": 5, "39it": 2, "3b": 2, "3d0f8c1149c24ec89f660b4e9d74703b": 4, "4": [1, 2, 3, 4, 5, 6, 10, 18, 20, 23], "40": [2, 4, 5, 6, 23], "400": 2, "4005": [2, 5], "4010": 4, "40312": 2, "40316": 2, "40325": 4, "40326": 2, "40852": 6, "40864": 6, "40876": 6, "40880": 6, "409": 4, "4096": [1, 2, 4, 5, 6, 12, 16, 21, 23], "4097": [2, 18], "41": [2, 4, 5, 23], "414": 6, "41710": 23, "41712": 23, "42": [2, 4, 23], "421": 18, "422029": 18, "422424": 18, "422425": 18, "424529": 18, "424549": 18, "42dd": 4, "43": [2, 4, 5, 6, 23], "4325": 4, "43it": 2, "442913": [4, 23], "44325": 4, "44482": 2, "44486": 2, "44488": 2, "448": 18, "45": [2, 4], "450929": 2, "456": 18, "456c": 4, "45758": 4, "4579": 4, "4594": 16, "46": [4, 5, 6, 23], "464e": 4, "46fe": 4, "47": [4, 5, 6, 23], "48": [2, 4, 23], "48142": 4, "48144": 4, "48158": 4, "48325": 4, "49": [4, 6, 18], "4aaf": 4, "4e1f": 4, "5": [1, 2, 3, 4, 5, 6, 10, 18, 19, 20, 23], "50": [2, 3, 4, 16, 18, 23], "500": [4, 16, 18], "5000": 18, "50000": [1, 18], "500552": 18, "506780": 18, "51": [2, 4, 6, 23], "511": 18, "512": 12, "51e2": 4, "52": [1, 2, 4, 23], "52025": 4, "52325": 4, "53": 23, "53306": 2, "53746": 2, "53750": 2, "53758": 2, "5393948555": 18, "53b19cf0": 4, "54": [4, 18], "55": [4, 23], "55it": 3, "56": 4, "563": 18, "57": [2, 4, 5], "579f6aabc97c4e04873ace532f84f98b": 23, "58": [4, 6], "580217155": 2, "58446": 4, "58452": 4, "58464": 4, "59": [2, 4, 5, 6, 23], "598": 18, "5b": 20, "6": [2, 3, 4, 5, 6, 8, 20, 23], "60": [5, 6, 12, 23], "6000": 12, "61": [4, 18], "61387": 18, "61it": 2, "62": [2, 4, 23], "63": [4, 23], "64": [1, 2, 4, 5, 12, 19, 23], "6452": 6, "6462": 6, "6463": 6, "6472": 6, "6494": 6, "6496": 6, "6498": 6, "65": 4, "6512": 6, "656": 18, "66": 6, "67": 2, "67efefc903554ce0b21596a043c1425f": 2, "68": [2, 4, 5], "69": 4, "7": [1, 2, 4, 5, 6, 18, 23], "70": 12, "70517": 18, "707": 18, "709160898": 4, "71": [2, 5, 6], "719166376": 6, "71a00267421546d1b54d14046470da9a": 23, "71f2a76de66d": 4, "72025": 4, "72b": [6, 20], "73": [2, 4, 23], "733": 18, "74": [2, 4, 5], "75": [2, 3, 4, 18, 23], "76": [4, 5], "760856485": 2, "77": [2, 4], "78": [2, 4, 5, 6, 23], "78835": 18, "79": [4, 5, 6], "7918": 4, "7b": [1, 2, 5, 6, 14, 19, 20], "7fa2af80": 12, "8": [1, 2, 3, 4, 5, 6, 19, 20, 22, 23], "80": [5, 6], "8000": 0, "814": 18, "8192": [2, 4, 5, 6, 18, 23], "82": 16, "825": 18, "829": 4, "8325": 4, "84": 6, "86": [2, 4, 5, 23], "88": [2, 4, 5, 6, 23], "8852": 4, "8856": 4, "89": 4, "8b": [1, 2, 3, 4, 10, 12, 19, 20, 22, 23], "8de0": 4, "9": [1, 2, 4, 6, 10, 16], "900": 18, "9031": 4, "91": 5, "9181": 4, "91f0": 4, "92025": 4, "93": [4, 6, 23], "9367": 4, "94": 4, "95": [1, 3, 4, 15], "9556": 4, "9602": 18, "97": [2, 6, 23], "9765": 4, "979": 18, "99": [4, 6], "997": 18, "9973": 4, "99934530258": 18, "9998": 16, "9b": 18, "A": [3, 10, 12, 16, 22], "As": [3, 6], "At": 3, "But": 3, "By": [14, 19], "For": [1, 4, 9, 12, 20, 23], "If": [1, 14, 16, 19, 21, 22], "In": [1, 2, 3, 4, 5, 6, 10, 15, 23], "It": [1, 2, 3, 4, 6, 9, 10, 11, 14, 16, 19, 22], "NOT": 14, "No": 2, "On": 16, "One": 3, "Or": 1, "THE": 4, "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 18, 19, 20, 22, 23], "Then": [8, 10, 18], "There": [3, 14], "To": [0, 1, 2, 3, 4, 5, 6, 10, 12, 15, 16, 18, 20, 22, 23], "__init__": 7, "__main__": 1, "__name__": 1, "_build": 0, "_tool": 6, "_work": 6, "a10": 22, "a100": 22, "a10a": 4, "a654": 4, "a7d4": 4, "a9c4": 4, "abl": 20, "about": [1, 3, 4, 10, 14, 15, 16, 17], "abov": [9, 12, 19, 21, 22], "ac76e655ca1": 4, "acceler": [1, 16, 22], "accept": [4, 19], "access": [0, 1, 3, 18, 22], "accord": [10, 12, 22], "account": [3, 15], "accumul": 15, "accur": 12, "achiev": [3, 4, 15], "across": [9, 15], "action": 6, "activ": 11, "ad": 22, "adapt": 3, "add": [1, 3, 5, 6, 10, 12, 15, 16, 19, 22], "add_safe_glob": 6, "addit": [3, 9, 10], "addition": 3, "addr": 1, "address": [1, 3, 10, 15], "adjust": [2, 6], "adopt": 11, "adv": 12, "advanc": [3, 4, 11], "aerospac": 3, "affect": 3, "after": [3, 23], "ag": 3, "again": 4, "against": 9, "ai": [1, 3, 4, 22], "aid": 3, "aim": [2, 3], "aith": 3, "alex": 3, "alia": 22, "alibaba": [2, 5, 20], "aliv": 10, "all": [0, 1, 3, 6, 8, 9, 10, 13, 16, 20, 22], "all_other_model": 20, "allow": [3, 6, 12, 22], "allowlist": 6, "almost": [1, 16, 20], "also": [1, 2, 3, 4, 5, 6, 10, 14, 15, 16, 19, 21, 23], "altern": [9, 10], "alwai": 16, "am": 3, "amazon": 3, "amd": [8, 22], "american": 3, "among": 2, "amplifi": 3, "an": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 18, 19, 22, 23], "analysi": 4, "ancient": 4, "ani": [1, 3, 6, 10, 19, 22], "ann": 3, "annot": 12, "annual": 3, "anoth": [3, 6, 20], "answer": [9, 10], "answer_1": 10, "answer_2": 10, "anthrop": 10, "anti": 3, "antidisestablishmentarian": 9, "anyon": [3, 23], "apart": 2, "api": [9, 10, 11, 14, 19, 20, 22], "api_kei": [1, 2, 4, 5, 6, 23], "appear": [3, 6, 19], "append": 4, "appli": 4, "applic": [1, 2, 3, 4, 5, 6, 11, 23], "apply_chat_templ": 2, "approach": [3, 22], "apt": [8, 12], "aqueduct": 4, "aquitain": 3, "ar": [1, 2, 3, 4, 5, 6, 9, 10, 12, 14, 16, 18, 19, 20, 22, 23], "arbitrari": 6, "arch": 8, "architectur": [2, 4, 12, 23], "area": 3, "arg": [9, 22], "argument": [2, 10, 12, 19], "aris": 15, "around": 3, "art": 23, "articl": 3, "artifici": 4, "ask": 11, "assembli": 4, "assert": [2, 4], "asset": 6, "assist": [1, 2, 4, 6, 9, 10, 14, 19, 23], "assistant_begin": 10, "assistant_end": 10, "assum": 18, "async": 3, "async_gener": 3, "asyncio": 3, "attach": 6, "attain": 16, "attent": [11, 20, 22], "attention_backend": [2, 4, 5, 6, 23], "attract": [3, 4, 9, 10], "audio": [4, 23], "augment": 3, "auror": 10, "australia": [4, 23], "author": 3, "auto": [2, 4, 5, 6, 23], "autom": 3, "automat": [2, 4, 6, 19], "autoregress": 10, "autosc": 22, "autotoken": [2, 5], "avail": [1, 2, 4, 5, 6, 18, 22, 23], "avenu": 3, "averag": 9, "avoid": [4, 21, 22], "aw": 3, "await": 3, "awar": 3, "awq": 11, "b": [2, 6, 22], "b2d8": 4, "b422": 4, "back": [2, 6, 11], "backend": [2, 9, 12, 22], "backend_input_fil": 4, "backend_result_fil": 4, "background": 6, "bad": [2, 9], "baichuan2": 20, "balanc": [4, 10], "base": [3, 9, 19], "base64": 19, "base_url": [1, 4, 5, 6, 23], "bash": [7, 8], "basi": 4, "basic": 3, "batch": [1, 2, 5, 6, 11, 12, 15, 16, 19, 22, 23], "batch_8acaf770": 4, "batch_ca3bfd8a": 4, "batch_cca4064f": 4, "batch_detail": 4, "batch_id": 4, "batch_job": 4, "batch_request": 4, "batch_respons": 4, "batchrequestcount": 4, "bathroom": 3, "beauti": 3, "becaus": [6, 10, 16], "becom": 3, "bedroom": 3, "been": 3, "befor": [6, 12, 19], "begin": [2, 4, 5, 6, 10, 23], "behind": 3, "beij": 4, "being": 16, "below": [8, 10, 19, 22], "bench_lat": [12, 20, 22], "bench_serv": 12, "benchmark": 11, "berlin": 9, "bespok": 9, "best": 3, "better": [1, 4, 16, 20, 22], "between": [1, 2, 3, 19], "bfloat16": [2, 4, 6, 23], "bia": [3, 10], "bias": 3, "bin": 8, "biohybrid": 3, "birthplac": 3, "bit": 3, "black": [2, 4, 5, 6, 23], "blend": 3, "blob": [6, 19], "block": [4, 6, 10, 23], "blog": 17, "blogpost": 9, "blood": 10, "blue": [2, 4, 5, 6, 23], "blurri": 6, "board": 6, "bodi": [4, 10], "bogart": 10, "bool": 19, "bordeaux": 3, "born": 10, "both": [16, 21], "bottleneck": 16, "bottom": 6, "boulevard": 3, "bound": 2, "bra": 4, "branch": 22, "bras\u00edlia": [4, 23], "brazil": [4, 23], "break": [19, 23], "breathtak": 3, "bridg": 4, "bright": 3, "bring": 3, "browser": 0, "bug": 4, "build": [1, 3, 6, 7, 22], "built": [3, 4, 22], "busi": 3, "bytesio": 6, "c": [2, 3, 4, 5, 6, 22, 23], "c163e2aaf2cd44888e220237d6c75ad1": 4, "c7f2": 4, "cab": 6, "cabernet": 3, "cach": [1, 4, 5, 6, 8, 11, 12, 15, 16, 18, 21, 22, 23], "cache_hit_r": 18, "cached_token": [2, 18, 23], "calcul": 10, "call": [3, 9, 10, 11], "campaign": 4, "can": [1, 2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "canberra": [4, 23], "cancel": 4, "cancelled_job": 4, "cannot": 19, "cap": 22, "capabl": 3, "capit": [1, 2, 3, 4, 9, 10, 19, 23], "captur": [2, 4, 6, 23], "car": 6, "case": [3, 6, 16], "cathedr": 3, "caus": [3, 15], "cd": [7, 13, 18, 22], "center": 4, "centric": 3, "centuri": [3, 4], "certifi": 3, "chain": 11, "challeng": 3, "chang": [3, 8, 15, 20], "chapter": 3, "charact": 10, "character_gen": 10, "character_regex": 10, "charm": 3, "chat": [1, 10, 11, 19, 20, 23], "chat_exampl": 10, "chat_templ": [2, 4, 5, 6, 14, 23], "chatcomplet": [4, 23], "chatcompletionmessag": [4, 23], "chatglm": 20, "chatml": [6, 14, 19, 20], "check": [1, 4, 22], "check_output": [5, 6, 23], "checkpoint": [1, 2, 3, 4, 5, 6, 12, 23], "china": 4, "choic": [4, 6, 10, 11, 23], "choices_method": 9, "chunk": [1, 3, 4, 6, 11, 19, 21, 23], "chunked_prefill_s": [2, 4, 5, 6, 23], "ci": 13, "citi": [3, 6, 23], "civil": 4, "clariti": [2, 4, 5, 6, 23], "class": [3, 19], "classifi": 23, "clean": [3, 4], "clear": 3, "cli": 12, "click": 18, "client": [1, 4, 12], "climat": 3, "clone": [0, 22], "cloth": 6, "cloud": 3, "cluster": 22, "co": [11, 14], "coach": 3, "code": [2, 4, 5, 6, 10, 12, 15, 20, 23], "collabor": 3, "color": [2, 4, 5, 6, 12, 23], "colosseum": 4, "columbia": 3, "com": [6, 7, 8, 12, 19, 22], "combin": [2, 4, 5, 6, 23], "come": 16, "command": [1, 5, 8, 12, 13, 20, 22], "commit": 13, "committe": 3, "common": 21, "commun": [3, 11], "compar": 20, "comparison": [9, 20], "compat": [2, 4, 5, 6, 10, 14, 19, 23], "competit": 3, "compil": [1, 16], "complet": [1, 2, 3, 5, 6, 10, 11, 23], "completion_token": [2, 4, 6, 23], "completion_tokens_detail": [4, 23], "completion_tokens_wo_jump_forward": [2, 23], "completion_window": 4, "completionchoic": 4, "completionusag": [4, 23], "complex": [3, 4, 10], "compon": 3, "compos": 18, "comput": [1, 3, 4, 6, 10, 12, 15, 16], "concept": 3, "concern": 3, "concis": 4, "conclus": 3, "conda": 22, "condit": 3, "confer": 3, "confid": 9, "config": [1, 12], "connect": [3, 10, 22], "conquest": 4, "consid": [3, 12, 19], "constrain": [1, 4, 11, 16, 19], "constrained_json_whitespace_pattern": [2, 4, 5, 6, 23], "constraint": [4, 10, 19], "construct": 6, "consul": 4, "contain": 9, "content": [1, 2, 4, 6, 10, 23], "context": 18, "context_len": [2, 4, 5, 6, 18, 23], "context_length": [2, 4, 5, 6, 23], "continu": [2, 10, 11], "contribut": 14, "contributor": 11, "control": [6, 11], "controversi": 3, "conv": 2, "convers": 14, "convert": 20, "copi": 22, "core": [10, 11], "corner": 6, "correct": [12, 19], "cost": 4, "could": [6, 19], "count": 4, "counter": 18, "countri": [1, 3, 4, 23], "cover": [4, 5, 6], "coverag": 20, "cpu": [1, 16], "cream": 3, "creat": [1, 4, 5, 6, 20, 23], "created_at": 4, "creativ": 4, "crisi": 3, "critic": [3, 12], "ctrl": [2, 4, 5, 6, 23], "cu121": 22, "cubla": 15, "cuda": [1, 2, 4, 5, 6, 8, 12, 15, 22, 23], "cuda_graph_max_b": [2, 4, 5, 6, 23], "cuda_visible_devic": 8, "cuisin": 23, "cultur": [3, 23], "curl": [1, 2, 8, 19], "curl_command": [6, 23], "curl_id": 5, "curl_text": 5, "currenli": [1, 16], "current": [3, 4, 6, 15, 18], "custom": [1, 3, 4, 11], "custom_id": 4, "custom_serv": 3, "d": [0, 1, 3, 4, 5, 6, 10, 12, 22, 23], "d7680014feae": 4, "dai": 3, "dark": 4, "data": [1, 2, 4, 5, 6, 16, 19, 22, 23], "dataclass": 19, "dataset": 12, "david": 3, "dbrx": 20, "deactiv": 22, "deadlock": 1, "death": 10, "deb": 12, "deceas": 10, "decis": 3, "declar": 3, "decod": [1, 2, 6, 11, 16, 21, 23], "decode_log_interv": [2, 4, 5, 6, 23], "decode_unicod": [19, 23], "decor": 10, "decreas": [16, 21], "deep": 3, "deepseek": [11, 20], "def": [1, 3, 9, 10], "default": [1, 6, 9, 14, 16, 19, 22], "defin": [10, 14], "del_respons": 4, "delai": 12, "delet": 4, "delete_ckpt_after_load": [2, 4, 5, 6, 23], "delta": [4, 23], "demonstr": 3, "depart": 3, "depend": 22, "deploi": 22, "deploy": [3, 22], "describ": [6, 9, 19], "descript": [12, 19], "design": [3, 11], "destin": 23, "detail": [2, 3, 4, 6], "detailed_tip": 10, "determin": 9, "detoken": 19, "dev": [1, 8, 22], "devel": 8, "develop": [3, 4, 12], "devic": [1, 2, 3, 4, 5, 6, 8, 22, 23], "devtool": 12, "dict": 19, "diet": 10, "differ": [2, 3, 6, 15, 20], "difficult": 19, "dimens": 2, "dine": 3, "dire": 3, "direct": 3, "directli": 1, "directori": 20, "disabl": [1, 12, 15, 19], "disable_cuda_graph": [2, 4, 5, 6, 23], "disable_cuda_graph_pad": [2, 4, 5, 6, 23], "disable_custom_all_reduc": [2, 4, 5, 6, 23], "disable_disk_cach": [2, 4, 5, 6, 23], "disable_flashinf": [2, 4, 5, 6, 23], "disable_flashinfer_sampl": [2, 4, 5, 6, 23], "disable_mla": [2, 4, 5, 6, 23], "disable_nan_detect": [2, 4, 5, 6, 23], "disable_pen": [2, 4, 5, 6, 23], "disable_radix_cach": [2, 4, 5, 6, 23], "disable_regex_jump_forward": [2, 4, 5, 6, 23], "dispatch": 15, "displai": [2, 4, 5, 6, 23], "dist_init_addr": [2, 4, 5, 6, 23], "distrib_releas": 12, "distribut": [2, 4, 5, 6, 23], "district": 3, "divers": 4, "dn": 10, "do": [3, 4, 6, 8, 12, 16, 19], "doc": [9, 12, 14, 19, 22], "doc_site_path": 0, "docker": [1, 18], "dockerfil": 22, "dockerx": 22, "document": [3, 14, 22], "doe": [1, 12, 16], "don": [3, 6], "donald": 9, "done": [8, 19, 23], "down": [3, 6, 9], "download": [12, 19], "dp": 1, "dp_size": [2, 4, 5, 6, 23], "dpkg": 12, "draw": 3, "dri": [8, 22], "drive": [3, 6], "driven": 3, "drought": 3, "drun": 22, "ds_channel_config_path": [2, 4, 5, 6, 23], "ds_heavy_channel_num": [2, 4, 5, 6, 23], "ds_heavy_channel_typ": [2, 4, 5, 6, 23], "ds_heavy_token_num": [2, 4, 5, 6, 23], "ds_sparse_decode_threshold": [2, 4, 5, 6, 23], "dtype": [1, 2, 4, 5, 6, 23], "duck": 9, "due": [3, 9, 16, 21], "dummi": 12, "dump": [4, 5, 19], "durat": 12, "dure": [1, 2, 4, 6, 16, 19, 21], "dynam": [12, 15], "e": [4, 8, 12, 20, 22], "e2e_request_latency_second": 18, "e2e_request_latency_seconds_bucket": 18, "e2e_request_latency_seconds_count": 18, "e2e_request_latency_seconds_sum": 18, "e5": [5, 11], "e8a2": 4, "each": 1, "earli": 16, "earlier": 9, "eas": 4, "easi": [11, 20, 21], "easier": 10, "easili": 3, "eater": 10, "echo": [8, 12], "ed879b81bfc4": 4, "edit": 8, "educ": 3, "effici": [1, 3, 11], "eiffel": 3, "either": 19, "element": 4, "eleutherai": 9, "elif": 10, "els": 4, "embed": [1, 4, 11, 23], "embedding_process": [2, 5, 6], "empathet": 3, "empir": 4, "empti": 1, "en": 14, "enabl": [1, 3, 4, 5, 6, 10, 15, 16, 22], "enable_cache_report": [2, 4, 5, 6, 23], "enable_double_spars": [2, 4, 5, 6, 23], "enable_metr": [2, 4, 5, 6, 23], "enable_mixed_chunk": [2, 4, 5, 6, 23], "enable_overlap_schedul": [2, 4, 5, 6, 23], "enable_p2p_check": [2, 4, 5, 6, 23], "enable_torch_compil": [2, 4, 5, 6, 23], "encod": [5, 19, 23], "encount": 22, "encourag": [4, 19], "end": [2, 3, 4, 5, 6, 10, 18, 19, 20, 23], "endpoint": [1, 4, 19, 22, 23], "engin": [4, 10, 11, 15], "england": 19, "enhanc": 3, "enough": [1, 16], "ensur": 3, "enthusiast": 3, "entranc": 3, "entrepreneur": 3, "entryclass": 20, "enumer": 10, "env": [1, 22], "environ": [1, 5, 8], "eo": [16, 19], "equival": [4, 5, 6, 15, 23], "era": 3, "error": [1, 2, 4, 16], "especi": [3, 16], "establish": [3, 4], "etc": [11, 12], "eth0": 1, "evalu": 2, "even": [2, 3, 9], "event": 3, "everyth": 3, "exampl": [2, 3, 4, 5, 8, 9, 18, 20, 22], "example_imag": [6, 19], "exaon": 20, "except": 4, "excit": 3, "exec": 12, "execut": [4, 6, 22, 23], "execute_shell_command": [2, 4, 5, 6, 23], "exercis": 10, "exist": [3, 4, 20], "expand": [4, 10], "experi": 3, "experiment": [1, 3, 6, 16], "expert": 3, "explain": 3, "explan": 3, "explicitli": 6, "explor": [3, 4], "export": [0, 1, 8, 10], "expos": 18, "express": [10, 19], "extend": 9, "extens": [11, 20], "extern": [10, 11], "extra_bodi": 4, "extrem": 3, "f": [1, 2, 3, 4, 5, 10, 18, 22], "f68cae25df17": 4, "f7d2d3ddffec4548a9ba9f11b51927af": 4, "face": [1, 4, 6, 14], "factor": [2, 15], "fail": [2, 4, 9], "failur": 22, "fair": 3, "fals": [2, 4, 5, 6, 19, 23], "famili": 3, "famou": 3, "far": 19, "fashion": [3, 23], "fast": 11, "faster": 11, "favor": 16, "fcf": 16, "featur": [1, 3, 6, 11], "fetch": 12, "few": 3, "field": 3, "file": [0, 4, 6, 12, 13, 14, 18, 19, 20, 21], "file_respons": 4, "file_storage_pth": [2, 4, 5, 6, 23], "fill": 10, "fillmor": 9, "final": [3, 4, 15], "find": [3, 10, 17, 20], "fine": 3, "finish_reason": [2, 4, 6, 23], "fire": [2, 4, 5, 6, 23], "first": [1, 2, 3, 5, 6, 10, 12, 16, 18], "firstli": 3, "fit": 3, "fix": 21, "flashinf": [2, 4, 5, 6, 11, 22, 23], "flexibl": [3, 11, 23], "flip": 6, "float": 19, "float16": [2, 5], "flood": 3, "flow": 11, "fluenci": 4, "flush": [3, 10, 19, 23], "flush_cach": 2, "focu": 3, "focus": [3, 4], "folder": [8, 12, 13], "follow": [1, 2, 4, 5, 8, 10, 12, 16, 18, 19, 20, 21], "foo": 4, "food": 3, "forb": 3, "forc": 3, "forefront": 3, "foreground": 6, "forev": 8, "fork": [10, 12], "form": 4, "format": [2, 3, 4, 5, 6, 10, 12, 19, 23], "forum": 4, "forward": [3, 11, 20], "forward_batch": 20, "foster": [2, 3], "found": [2, 3, 10], "founder": 3, "four": 3, "fourth": 3, "fp16": 1, "fp8": [1, 11, 16, 22], "fp8_e5m2": 1, "fraction": [1, 6, 21], "framework": [3, 11], "franc": [1, 2, 3, 4, 9, 10, 19, 23], "frequenc": 19, "frequency_penalti": [4, 19], "frequent": [3, 11, 16], "friend": 3, "from": [2, 3, 4, 5, 6, 10, 13, 14, 15, 23], "from_pretrain": [2, 5], "frontend": [14, 22], "full": [1, 3, 6, 16], "fulli": 4, "fun": 3, "function": [3, 6, 9, 10, 20], "function_cal": [4, 23], "further": 22, "futur": [1, 2, 3, 6, 20], "futurewarn": 6, "g": [6, 8, 12, 20, 22], "galleri": 3, "garden": 3, "garonn": 3, "gastronom": 3, "gaug": 18, "gb": [2, 4, 5, 6, 23], "gemini": 10, "gemma": [11, 18, 20], "gemma2forsequenceclassif": 20, "gen": [2, 4, 6, 9, 10, 16, 18, 23], "gen_throughput": 18, "gener": [0, 1, 4, 6, 11, 18, 19], "generatereqinput": 19, "generation_tokens_tot": 18, "get": [3, 4, 5, 6, 20, 22, 23], "get_memory_pool_s": 2, "get_model_info": [2, 4, 5, 6, 23], "get_server_arg": 2, "git": [8, 22], "github": [0, 6, 19, 21, 22], "githubusercont": 6, "give": [4, 8, 20], "given": [4, 19], "glad": 3, "glass": 3, "glm": 20, "global": 3, "gloo_socket_ifnam": 1, "gnupg": 12, "go": 3, "goal": 3, "good": [3, 16], "googl": [10, 18], "govern": [3, 4], "gpt": 10, "gptq": [2, 4, 5, 6, 11, 23], "gpu": [1, 8, 16, 18, 22], "grammar_backend": [2, 4, 5, 6, 23], "grand": 3, "grape": 3, "graph": [1, 2, 4, 6, 12, 23], "great": 3, "greater": 3, "greedy_token_select": 9, "grok": 20, "group": [2, 8, 22], "gryffindor": 10, "gte": [2, 5, 11, 20], "guarante": [4, 19], "guid": [11, 19, 22, 23], "h": 1, "h100": 22, "ha": [3, 16, 20], "haisgl": 8, "half": 10, "hand": 16, "handl": [12, 19, 23], "happen": [16, 21], "happi": 4, "harri": 10, "hasattr": 4, "have": [0, 1, 3, 4, 6, 9, 15, 16, 18], "health": 3, "health_gener": 2, "healthi": [10, 16], "hello": [1, 3], "help": [1, 3, 4, 10, 16, 18, 19, 20, 21], "henryx": 8, "here": [1, 2, 3, 4, 10, 18, 19, 23], "heritag": 3, "hf": 14, "hf_home": 8, "hf_token": [8, 22], "hf_xxx": 8, "hidalgo": 3, "high": [4, 9, 16, 19], "higher": 4, "highest": [9, 10], "highlight": [2, 4, 5, 6, 23], "hilltop": 3, "histogram": 18, "histor": 3, "histori": [3, 23], "historian": 4, "hit": [2, 4, 5, 6, 18, 19, 23], "home": [3, 22], "hood": 15, "host": [1, 2, 3, 4, 5, 6, 22, 23], "hostnam": 1, "hous": [3, 10], "how": [1, 3, 9, 10, 13], "html": [0, 12], "http": [0, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 18, 19, 22, 23], "hub": 22, "hufflepuff": 10, "hug": [1, 4, 6, 14], "huggingfac": [8, 14, 20, 22], "human": 3, "hyperparamet": [1, 11], "i": [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23], "icon": [3, 4, 6], "icra": 3, "id": [2, 4, 6, 19, 23], "ident": 20, "ignor": 19, "ignore_eo": 19, "im_end": [14, 19], "im_start": [14, 19], "imag": [3, 10, 19, 22], "image_data": 19, "image_fil": 10, "image_id": 22, "image_qa": 10, "image_url": 6, "impact": 3, "implement": [3, 4, 9, 15, 20, 23], "implic": 19, "implicitli": 6, "import": [1, 2, 3, 4, 5, 6, 10, 12, 16, 18, 19, 23], "imposs": 3, "improv": [2, 3, 4, 5, 6, 23], "includ": [3, 4, 10, 11], "incorrect": 9, "increas": 16, "incur": 9, "independ": [3, 22], "indetermin": 15, "index": [2, 4, 6, 12, 23], "indic": 16, "individu": 3, "industri": 11, "inequ": 3, "inf": [18, 19], "infer": [1, 19], "info": [3, 4, 5, 6, 23], "inform": [2, 4, 10, 19], "infra": 22, "infrastructur": [3, 4], "init": [1, 2, 4, 5, 6, 23], "initi": [4, 9, 15], "input": [1, 4, 10, 11, 12, 19, 22], "input_file_id": 4, "input_file_path": 4, "input_id": [5, 19], "input_ids_embed": 5, "insid": [3, 8], "instal": [0, 2, 4, 5, 6, 7, 8, 11, 12, 13, 23], "installationguid": 12, "instanc": 9, "instead": 21, "instinct": 22, "instruct": [1, 2, 3, 4, 5, 6, 10, 12, 19, 20, 22, 23], "int": 19, "int4": 11, "int4wo": 1, "integ": [4, 19], "integr": 11, "intellig": 4, "intend": 3, "interact": [3, 11], "interest": 3, "interfac": [11, 20], "interleav": 6, "intern": 3, "internet": 3, "internlm": 20, "internlm2": 20, "internlm2forrewardmodel": 20, "interpret": 4, "intfloat": 5, "introduc": [2, 15], "intuit": 11, "invest": 3, "investig": 15, "invok": 10, "io": [0, 6], "ip": [1, 10], "ipc": [1, 22], "iron": 6, "is_embed": [2, 4, 5, 6, 23], "is_gener": 2, "isgener": 3, "issu": [3, 6, 10, 15, 21, 22], "itali": 4, "iter_lin": [19, 23], "its": [2, 3, 4, 9], "jacket": 6, "japan": [4, 10, 23], "job": 4, "johnson": 3, "joke": 4, "json": [1, 2, 5, 6, 12, 18, 23], "json_decod": 10, "json_model_override_arg": [2, 4, 5, 6, 23], "json_output": 10, "json_schema": [4, 19], "jsonl": 4, "jump": 11, "just": [4, 14], "k": 19, "k8": 22, "keep": 4, "kei": [2, 10, 12], "kernel": [11, 15, 21, 22], "kfd": [8, 22], "kind": 4, "kingdom": 10, "knowledg": [3, 4], "known": [3, 4], "kv": [1, 16, 21, 22], "kv_cache_dtyp": [2, 4, 5, 6, 23], "l": [6, 19], "l4": 22, "l40": 22, "lab": [6, 19, 20], "label": 8, "lack": 3, "landmark": 3, "landscap": 3, "lang": [6, 19], "languag": [3, 4, 6, 11, 14, 22, 23], "larg": [2, 3, 6, 11, 12, 16], "larger": 3, "last": [4, 22], "late": 3, "latenc": 18, "later": [8, 9], "latest": [1, 22], "launch": [1, 3, 10, 12, 14, 19, 22], "launch_serv": [1, 2, 4, 5, 6, 10, 12, 14, 19, 20, 22, 23], "law": [3, 4], "layer": [15, 20], "layer_id": 20, "le": 18, "lead": 15, "learn": [1, 3, 4, 11, 13, 20], "least": 19, "leav": 3, "left": [3, 6], "len": [12, 19, 23], "length": [4, 10, 18, 19, 23], "less": [3, 4], "let": 1, "level": [3, 4, 19], "lib": 6, "librari": 10, "light": 4, "like": [3, 4, 16], "limit": [6, 9], "line": 4, "lint": 13, "linux": 8, "lisa": 3, "list": [1, 4, 6, 10, 12, 19, 20, 21, 23], "live": 3, "ll": 3, "llama": [2, 3, 4, 6, 10, 11, 12, 14, 19, 20, 22, 23], "llama3": [6, 20], "llama_3_vis": 6, "llamaembeddingmodel": 20, "llamaforcausallm": [2, 4, 23], "llamaforsequenceclassif": [2, 20], "llava": [6, 11, 19, 20], "llava_llama_3": [6, 20], "llm": [1, 3, 9, 11], "lm_eval": [2, 4, 5, 6, 23], "lmm": [6, 19, 20], "lmsysorg": [1, 22], "load": [1, 2, 3, 4, 5, 6, 12, 14, 16, 19, 23], "load_balance_method": [2, 4, 5, 6, 23], "load_format": [2, 4, 5, 6, 23], "load_imag": 19, "local": [4, 5, 6, 22], "local_example_llava_next": 10, "localhost": [0, 1, 2, 4, 5, 6, 10, 18, 19, 23], "locat": [3, 19], "log": [2, 4, 5, 6, 10, 16, 23], "log_level": [2, 4, 5, 6, 23], "log_level_http": [2, 4, 5, 6, 23], "log_request": [2, 4, 5, 6, 23], "logit": [10, 19, 20], "logitsprocessor": 20, "logo": 6, "logprob": [4, 6, 9, 19, 23], "logprob_start_len": 19, "london": [4, 9], "long": [1, 3, 4], "longer": [4, 6, 9], "longest": 16, "look": [3, 4, 14, 16], "loop": 10, "lora_path": [2, 4, 5, 6, 23], "lot": 3, "louvr": 3, "love": 23, "low": 19, "lower": [4, 16], "lpm": [2, 4, 5, 6, 16, 23], "lsb": 12, "lt": [2, 3, 4, 5, 6, 23], "m": [0, 1, 2, 3, 4, 5, 6, 10, 12, 14, 19, 20, 22, 23], "machin": [3, 22], "magazin": 3, "magic": 10, "mai": [2, 4, 5, 6, 10, 12, 15, 21, 23], "main": [1, 3, 6, 14, 19], "mainli": 2, "maintain": [2, 3, 20], "major": [4, 20], "make": [0, 3, 4, 11, 16, 20], "malici": 6, "man": 6, "manag": 10, "mani": [3, 9, 15, 16, 20], "manner": 19, "market": 3, "mask": 10, "match": [2, 16], "matched_stop": [4, 6, 23], "materi": 17, "math": 10, "mathemat": 15, "max": 21, "max_check": 4, "max_loras_per_batch": [2, 4, 5, 6, 23], "max_new_token": [1, 16, 19, 23], "max_prefill_token": [2, 4, 5, 6, 18, 23], "max_running_request": [2, 4, 5, 6, 18, 23], "max_token": [1, 4, 6, 10, 23], "max_total_num_token": [2, 4, 5, 6, 18, 23], "max_total_token": [2, 4, 5, 6, 23], "maximum": [18, 19], "mayor": 3, "md": [6, 13], "me": 4, "mean": [3, 16, 19], "mechan": 3, "media": 3, "medic": 3, "meet": [1, 3], "mem": [1, 2, 4, 5, 6, 21, 23], "mem_fraction_stat": [2, 4, 5, 6, 23], "memori": [1, 4, 5, 6, 12, 23], "mention": 6, "merlot": 3, "messag": [1, 2, 4, 6, 10, 23], "meta": [1, 2, 3, 4, 6, 10, 12, 14, 19, 22, 23], "meta_info": [2, 23], "method": [4, 11], "mi": 22, "micro": 3, "mid": 3, "midst": 3, "mild": 4, "mile": 4, "militari": 4, "millard": 9, "min": 19, "min_new_token": 19, "min_p": 19, "minicpm": 20, "ministri": 10, "minut": [2, 4, 6, 23], "mislead": 9, "miss": [3, 14], "mistral": [5, 11, 20], "misus": 3, "mix": 19, "mixtral": 20, "mllamaforconditionalgener": 6, "modal": [1, 11], "mode": [3, 4, 6, 15], "model": [3, 4, 5, 6, 8, 9, 11, 12, 14, 16, 18, 19, 22, 23], "model_path": [1, 2, 3, 4, 5, 6, 23], "moder": 4, "modul": 6, "moe": 20, "moistur": 3, "mona": 3, "monitor": [4, 18], "montmartr": 3, "moral": 2, "more": [1, 2, 3, 4, 6, 11, 15, 19, 22, 23], "most": [3, 14, 16, 20], "mostli": 15, "mount": 8, "movement": 3, "mr": 3, "much": 3, "muggl": 10, "multi": [1, 11], "multi_turn_quest": 10, "multimod": 6, "multipl": [1, 4], "museum": 3, "must": [2, 3, 19, 23], "my": [1, 3], "my_model": 14, "my_model_templ": 14, "n": [2, 3, 4, 10, 19, 23], "n1": [4, 23], "n2": [4, 23], "n3": [4, 23], "name": [1, 2, 3, 4, 8, 9, 10, 12, 14, 18, 19], "namespac": 18, "nanyang": 3, "nativ": 11, "natur": [3, 4], "nbecaus": 4, "nccl": 1, "ndescrib": 19, "necess": 3, "need": [2, 3, 6, 8, 10, 12, 14, 20, 22], "neighborhood": 3, "nemo": 20, "nervou": 3, "nest": 10, "network": [2, 3, 22], "neural": [2, 3], "neutrogena": 3, "new": [2, 3, 4, 5, 6, 7, 8, 11, 16, 18, 19, 23], "new_seq": 18, "new_token": 18, "new_token_ratio": 16, "next": [6, 20], "ngener": [1, 3], "ni": 2, "nice": 3, "night": 3, "nixon": 3, "nlist": 4, "nlp": [2, 5, 20], "nnode": [1, 2, 4, 5, 6, 23], "node": [1, 2, 12], "node_rank": [2, 4, 5, 6, 23], "non": [2, 10], "nondeterminist": 15, "none": [2, 4, 5, 6, 19, 23], "normal": 10, "note": [2, 4, 5, 6, 8, 12, 14, 19, 20, 23], "notebook": [2, 4, 5, 6, 23], "notic": 15, "novel": 4, "now": [2, 10, 22], "nprompt": 3, "npython": 4, "nrel": 4, "nroll": 2, "nsy": 12, "ntu": 3, "null": [6, 22], "num": 12, "num_continuous_decode_step": [2, 4, 5, 6, 23], "num_requests_run": 18, "num_requests_wait": 18, "number": [2, 16, 18, 19], "numer": [2, 15], "nvidia": [8, 12], "nvtx": 12, "nw": 3, "nyou": 19, "o": [4, 5, 8, 12, 19], "object": [4, 6, 19, 23], "obstacl": 3, "obtain": 9, "occasion": 16, "occup": 10, "off": 6, "offer": [3, 11], "offici": [3, 14], "offlin": [1, 11], "often": 3, "ok": [2, 4, 5, 6, 23], "okai": 16, "olai": 3, "olmo": 20, "omit": 9, "onc": [1, 2, 5, 6, 9, 23], "one": [2, 3, 4, 6, 9, 10, 15, 19], "onevis": [6, 19, 20], "onli": [2, 3, 4, 6, 9, 10, 12, 15, 19, 20, 22], "onlin": 12, "only_run": 20, "oom": [16, 21], "open": [4, 6, 11, 22], "openai": [2, 9, 11, 14, 19, 20, 22], "openai_api_kei": [8, 10], "oper": [2, 22], "opinion": 3, "opt": 22, "option": [3, 9, 19], "orang": 6, "order": 10, "organ": 3, "organiz": 2, "origin": [2, 4, 5, 6, 23], "other": [2, 3, 4, 6, 9, 16, 20, 22], "otherwis": 6, "our": 15, "out": [1, 4, 10, 12, 22], "outcom": 3, "outlet": 3, "outlin": [2, 4, 5, 6, 23], "output": [1, 2, 3, 4, 5, 6, 12, 15, 18, 19, 20, 22, 23], "output_file_id": 4, "outsid": 3, "ov": [6, 19, 20], "over": [3, 4], "overcom": 3, "overhead": [3, 16], "overlap": [1, 9, 16], "overrid": 14, "overse": 3, "own": [1, 22], "p": [1, 18, 19, 22], "p2p": 1, "packag": 6, "pad": 15, "page": [4, 11, 21], "paint": 3, "pairwis": 2, "pantheon": 4, "paper": 3, "paragraph": 10, "parallel": [1, 11, 16, 19], "paramet": [2, 11, 16, 21, 23], "pari": [2, 3, 4, 9, 19, 23], "park": 3, "part": 20, "particip": 3, "particularli": 3, "pass": [10, 13, 20], "passion": 3, "path": [0, 1, 2, 4, 5, 6, 9, 10, 12, 14, 19, 20, 22, 23], "patronu": 10, "pattern": [4, 19], "peer": 1, "penal": 19, "penalti": [4, 19], "pennsylvania": 3, "peopl": 3, "per": [15, 18], "perform": [1, 2, 9, 19], "perpetu": 3, "pertain": 2, "ph": 3, "phoenix": 10, "phrase": 4, "pickl": 6, "pip": [0, 7, 8, 12], "pip3": 13, "place": 3, "plan": [3, 22], "playground": 20, "pleas": [1, 2, 6, 10, 21, 22], "png": [6, 19], "podcast": 3, "pool": [1, 4, 5, 6, 16, 21, 23], "poorli": 9, "popul": [4, 19], "popular": [3, 4, 6], "port": [1, 2, 4, 5, 6, 10, 14, 19, 22, 23], "portion": 15, "possibl": 6, "post": [2, 4, 5, 6, 19, 23], "post3_vllm0": 8, "potenti": [3, 15], "potter": 10, "pre": 13, "predict": [3, 9], "prefer": 4, "prefil": [1, 2, 4, 5, 6, 11, 12, 18, 20, 21, 23], "prefix": [11, 15, 16], "premier": 3, "prerequisit": 12, "presence_penalti": [4, 19], "presid": [1, 3, 9], "press": [2, 3, 4, 5, 6, 23], "prev": [19, 23], "prevent": 3, "primarili": 3, "primit": [9, 10], "princip": 3, "print": [1, 3, 4, 10, 12, 19, 23], "print_highlight": [2, 3, 4, 5, 6, 23], "priorit": [2, 3], "probabl": 10, "problem": 3, "process": [2, 3, 4, 5, 6, 18, 23], "produc": 3, "product": 3, "profil": 11, "program": [4, 11, 22], "programm": 4, "progress": 15, "progress_bar": 10, "project": [0, 6, 7, 8, 14, 15, 17, 19, 22], "prometheu": 18, "promot": [2, 3], "prompt": [1, 2, 3, 4, 10, 11, 12, 19], "prompt_token": [2, 4, 6, 23], "prompt_tokens_detail": [4, 6, 23], "prompt_tokens_tot": 18, "proper": 22, "properti": [4, 19], "provid": [1, 2, 3, 4, 5, 6, 10, 11, 12, 22, 23], "pub": 12, "pull": 8, "pure": 10, "purpos": 4, "py": [0, 2, 3, 4, 5, 6, 7, 10, 12, 14, 19, 20, 23], "pydant": 10, "pyproject": 7, "pyramid": 3, "python": [1, 2, 3, 4, 7, 10, 12, 14, 19, 20, 22], "python3": [0, 1, 2, 6, 8, 12, 19, 20, 22], "pytorch": [6, 15, 22], "q": 10, "qk": [2, 4, 5, 6, 23], "qualiti": [2, 3], "quantiz": [1, 2, 4, 5, 6, 11, 22, 23], "queri": 20, "question": [10, 11], "question_1": 10, "question_2": 10, "queu": 18, "queue": [2, 4, 5, 6, 16, 23], "queue_req": 18, "quick": [11, 12], "quick_start": 10, "quickli": 3, "quit": [2, 4, 5, 6, 23], "qwen": [1, 6, 11, 20], "qwen2": [1, 2, 5, 6, 19, 20], "qwen2forcausallm": [2, 5], "r": [0, 10, 20], "radix": [2, 12, 15], "radixattent": [11, 20], "rais": [2, 4], "random": [12, 15], "random_se": [2, 4, 5, 6, 23], "rang": [2, 4, 11, 16], "rank": 1, "rapid": 3, "rate": [2, 4, 5, 6, 18, 23], "rather": 3, "ravenclaw": 10, "raw": [6, 19], "rb": 4, "re": [3, 4], "reach": [3, 19], "read": [3, 4], "readabl": 4, "readi": [2, 4, 5, 6, 23], "readm": 13, "readme_exampl": 10, "real": 12, "reason": [3, 4], "recognit": 3, "recogniz": 3, "recommend": [6, 12, 22], "recoveri": 22, "reduc": [1, 4, 16, 21], "refer": [1, 4, 5, 6, 20, 21, 23], "reference_hf": 20, "refus": [4, 23], "regenerist": 3, "regex": 10, "region": 3, "regul": 3, "regular": [10, 19], "regular_expression_gen": 10, "relat": [6, 14, 22], "relationship": 2, "releas": [6, 12, 22], "remain": 15, "remark": 3, "rememb": [5, 6], "remot": [2, 20, 22], "remov": [0, 4, 20], "renown": 3, "repair": 3, "repeat": 19, "repetit": 4, "repetition_penalti": 19, "replac": [1, 3, 20, 22], "repo": 12, "report": [1, 21], "reproduc": 4, "req": [2, 4, 5, 6, 16, 23], "request": [1, 2, 4, 10, 11, 15, 18, 19, 21], "request_count": 4, "request_generation_token": 18, "request_generation_tokens_bucket": 18, "request_generation_tokens_count": 18, "request_generation_tokens_sum": 18, "request_id": 4, "request_prompt_token": 18, "request_prompt_tokens_bucket": 18, "request_prompt_tokens_count": 18, "request_prompt_tokens_sum": 18, "requir": [0, 3, 4, 19], "research": 3, "resid": 3, "resign": 3, "resourc": [20, 22], "respons": [1, 2, 4, 5, 6, 9, 19, 23], "response1": 2, "response2": 2, "response_format": 4, "response_json": 2, "restart": [2, 8], "result": [3, 4, 9], "result_cont": 4, "result_file_id": 4, "retracted_req": 16, "retriev": 4, "return": [6, 19], "return_logprob": 19, "return_text_in_logprob": 19, "reus": 20, "reward": [11, 23], "reward_process": 2, "rich": 3, "richard": 3, "rid": 19, "river": 3, "rm": [8, 22], "rmsnorm": 20, "road": 4, "robot": 3, "rocm": 22, "rocm620": 22, "role": [1, 2, 4, 6, 23], "roll": [2, 4, 5, 6, 23], "roman": 4, "romanc": [3, 23], "rome": 4, "rongxiang": 3, "room": 3, "root": [1, 15, 22], "roughli": 15, "round_robin": [2, 4, 5, 6, 23], "routin": 3, "run": [0, 2, 3, 4, 5, 6, 10, 12, 13, 15, 18, 20, 21, 23], "run_batch": 10, "runner": 6, "runner_allow_runasroot": 8, "runtim": [2, 11, 22], "runtimeendpoint": [9, 10], "safetensor": [2, 3, 4, 5, 6, 23], "sai": 4, "saint": 3, "same": [2, 10, 12, 15, 19, 20], "sampl": [2, 11, 20, 22, 23], "sampling_backend": [2, 4, 5, 6, 23], "sampling_param": [1, 3, 19, 23], "saturdai": 3, "sauvignon": 3, "scalabl": 3, "scale": [3, 22], "schedul": [1, 3], "schedule_conserv": [2, 4, 5, 6, 23], "schedule_polici": [2, 4, 5, 6, 23], "schema": [4, 10, 19], "school": 3, "scientif": 4, "script": [3, 20], "sculpt": 3, "search": 10, "seccomp": 22, "seciton": 21, "second": [4, 6, 18], "secondli": 3, "secret": 22, "section": 19, "secur": [6, 22], "see": [1, 3, 6, 10, 16, 19, 20, 21, 22, 23], "seed": [4, 15], "seema": 3, "select": [10, 18, 22], "self": [4, 5, 6], "sell": 3, "senat": 4, "send": [1, 6, 11, 15, 16, 19], "sentenc": [6, 19], "sep": 14, "sep_styl": 14, "separ": [2, 4, 5, 6, 23], "seq": [2, 4, 5, 6, 23], "sequenc": [4, 18], "seri": 4, "serial": 6, "serv": [1, 3, 11, 12, 16, 22], "served_model_nam": [2, 4, 5, 6, 23], "server": [0, 3, 10, 12, 14, 15, 16, 18, 19, 20], "server_arg": [2, 4, 5, 6, 23], "server_process": [2, 4, 23], "serverarg": [2, 4, 5, 6, 23], "servic": [4, 5, 6, 22], "service_ti": [4, 23], "set": [1, 6, 10, 12, 14, 15, 19, 22], "set_default_backend": 10, "sever": [2, 3, 4, 6, 12, 23], "sgl": [0, 1, 3, 6, 7, 8, 9, 10, 15, 17, 19, 22], "sgl0": 8, "sgl_branch": 22, "sglang": [2, 3, 4, 5, 6, 7, 8, 12, 13, 15, 17, 18, 23], "sglang_is_in_ci": 8, "sglang_storag": [2, 4, 5, 6, 23], "sglang_use_modelscop": 1, "sh": 7, "shape": 3, "shard": [2, 3, 4, 5, 6, 23], "share": [3, 8, 16], "shell": [5, 6, 23], "shirt": 6, "shm": [8, 22], "short": [4, 15, 19], "shorter": 9, "should": [4, 14, 20], "show": [6, 10], "show_time_cost": [2, 4, 5, 6, 23], "showcas": 4, "shutdown": 3, "sigmoid": 2, "sign": 3, "signific": 3, "siluandmul": 20, "similar": [2, 4, 19, 20], "similarli": 15, "simpl": 10, "simpli": 9, "simplic": 4, "simul": 4, "sinc": 4, "singapor": 3, "singl": [1, 4, 12, 19, 20, 22], "singleton": 2, "site": [3, 6], "situat": 3, "size": [1, 6, 8, 12, 15, 20, 21, 22], "sk": [8, 10], "skill": [2, 4], "skin": 3, "skincar": 3, "skip": 19, "skip_special_token": 19, "skip_tokenizer_init": [2, 4, 5, 6, 23], "sky": 22, "skyserv": 22, "skywork": [2, 11, 20], "sleep": [4, 8], "slide": 17, "slight": 15, "slightli": [4, 15], "slower": 15, "slytherin": 10, "sm75": 22, "small": [1, 6, 15, 16], "smaller": 1, "smollm": 20, "smooth": [4, 5, 6], "snippet": [6, 12], "so": [2, 3, 4, 5, 6, 12, 19, 23], "social": 3, "soft": 3, "solut": [3, 15], "some": [3, 8, 10, 12, 20, 21], "sometim": 21, "soon": 3, "sought": 3, "sourc": [11, 12], "space": [4, 19], "spaces_between_special_token": 19, "speak": 15, "speaker": 3, "special": [3, 19], "specif": [1, 20, 22], "specifi": [1, 4, 6, 8, 9, 10, 14, 19], "split": 4, "srt": [11, 19, 20, 22], "stabl": 4, "stablelm": 20, "stai": 10, "stand": [6, 16], "start": [2, 4, 5, 6, 18, 19, 20], "startswith": [19, 23], "startup": [2, 4, 5, 6, 23], "state": [1, 3, 4, 10], "static": [1, 6, 12, 21], "statu": [4, 10, 22], "status_cod": 4, "still": [2, 15], "stop": [2, 4, 6, 10, 16, 19, 23], "stop_str": 14, "stop_token_id": 19, "storag": 6, "stori": 4, "str": 19, "strategi": 1, "stream": [1, 4], "stream_interv": [2, 4, 5, 6, 23], "street": 6, "string": [4, 16, 19], "strip": [4, 19, 23], "strong": [4, 9], "structur": [3, 11], "struggl": 3, "student": [3, 10], "stun": 3, "style": 3, "subprocess": [5, 6, 23], "subset": 9, "succeed": 2, "success": [2, 3], "successfulli": [2, 4], "suggest": 16, "summari": 10, "suppli": 9, "support": [1, 2, 3, 4, 5, 6, 9, 10, 11, 19, 22], "sure": [0, 3, 20], "suscept": 3, "switch": 22, "symbol": 3, "symptom": 3, "sys_ptrac": 22, "system": [1, 3, 4, 10, 12, 14, 19, 22], "system_fingerprint": [4, 23], "t": [3, 6, 22], "t4": 22, "tabl": 4, "take": [2, 4, 6, 16, 23], "taxi": 6, "teacher": 10, "techniqu": 3, "technolog": 3, "technologi": 3, "tee": 12, "tell": 4, "temperatur": [1, 3, 4, 6, 10, 19, 23], "templat": [1, 4, 10, 11, 19, 20], "temporari": 3, "tensor": [1, 2, 11], "term": 15, "termin": [2, 4, 5, 6, 22, 23], "terminate_process": [2, 4, 5, 6, 23], "territori": 4, "test": [2, 3, 4, 6, 8, 12, 19, 23], "test_generation_model": 20, "test_oth": 20, "test_vision_openai_serv": 20, "testgenerationmodel": 20, "text": [1, 3, 4, 5, 6, 19, 20, 23], "text_complet": 4, "text_embed": 5, "text_it": 10, "text_qa": 10, "than": 3, "thei": [3, 4, 6, 19], "them": [3, 21, 22], "therefor": 2, "thi": [0, 1, 2, 3, 4, 5, 6, 9, 10, 12, 14, 15, 16, 19, 20, 21, 22, 23], "thing": [3, 16], "think": 3, "through": [4, 10], "throughput": [1, 2, 4, 6, 18, 23], "till": 22, "time": [1, 2, 3, 4, 5, 12, 15, 18, 19], "time_per_output_token_second": 18, "time_per_output_token_seconds_bucket": 18, "time_per_output_token_seconds_count": 18, "time_per_output_token_seconds_sum": 18, "time_to_first_token_second": 18, "time_to_first_token_seconds_bucket": 18, "time_to_first_token_seconds_count": 18, "time_to_first_token_seconds_sum": 18, "tip": 21, "tip_suggest": 10, "tmp": 8, "todai": [1, 3], "togeth": [1, 2, 3, 4, 5, 6, 16, 23], "token": [1, 2, 4, 5, 6, 10, 11, 14, 16, 18, 19, 22, 23], "token_id": 19, "token_length_norm": 9, "token_usag": 18, "tokenizer_mod": [2, 4, 5, 6, 23], "tokenizer_path": [2, 4, 5, 6, 23], "tokenizers_parallel": 5, "tokyo": [4, 23], "toml": 7, "too": [4, 16], "tool": 10, "tool_cal": [4, 23], "tool_us": 10, "top": [3, 19], "top_k": 19, "top_logprobs_num": 19, "top_p": [1, 3, 4, 19], "topic": 4, "torch": [1, 2, 4, 5, 6, 16, 23], "torch2": 22, "torch_compile_max_b": [2, 4, 5, 6, 23], "torchao": 1, "torchao_config": [2, 4, 5, 6, 23], "total": [1, 4, 18], "total_token": [4, 6, 23], "tower": 3, "tp": [1, 20, 22], "tp0": [2, 4, 5, 6, 23], "tp_size": [2, 4, 5, 6, 23], "tr": 12, "trace": 12, "track": 15, "tradit": 3, "train": [2, 3, 12], "transform": [2, 5, 14, 20], "transit": [4, 5, 6], "transpar": 3, "treat": 2, "trigger": 2, "triton": 22, "triton_attention_reduce_in_fp32": [2, 4, 5, 6, 23], "troubleshoot": 11, "true": [1, 2, 3, 4, 5, 6, 8, 10, 12, 19, 23], "truncat": 12, "trust": 20, "trust_remote_cod": [2, 4, 5, 6, 23], "try": [1, 4, 21], "tune": [1, 11, 21], "turbo": 10, "turn": [6, 10], "tutori": [4, 5, 6], "twice": 15, "twine": 7, "two": [1, 3, 4, 6, 10, 14, 15, 20], "txt": 0, "type": [1, 2, 3, 4, 5, 6, 18, 19, 23], "typic": [2, 4, 5, 6, 23], "u": 9, "ubiquit": 3, "ubuntu": 12, "ubuntu1804": 12, "ubuntu22": 8, "unconditional_likelihood_norm": 9, "unconfin": 22, "under": [12, 13, 15, 20], "understand": [3, 20], "union": 19, "unit": [1, 3, 4, 10, 12], "uniti": 2, "unittest": 20, "univers": 3, "unless": 6, "unlock": 3, "unnecessari": 3, "unpickl": 6, "until": 19, "untrust": 6, "up": [2, 3, 4, 5, 6, 18, 22, 23], "updat": [0, 3, 8, 12], "update_weight": 2, "upgrad": 22, "upload": [4, 18], "upload_pypi": 7, "uploaded_fil": 4, "upon": [1, 2, 5], "url": [2, 4, 6, 19, 23], "us": [2, 3, 4, 8, 9, 12, 13, 14, 15, 16, 18, 19], "us_president_exampl": 9, "usabl": [2, 4, 5, 6, 23], "usag": [1, 2, 5, 6, 9, 16, 18, 21, 22, 23], "user": [1, 2, 3, 4, 6, 9, 10, 14, 16, 19, 23], "usual": 4, "utf": [4, 19, 23], "util": [2, 3, 4, 5, 6, 16, 19, 23], "uvicorn": [2, 4, 5, 6, 23], "v": [1, 8, 22], "v0": [2, 20, 22], "v1": [1, 2, 4, 5, 6, 23], "valid": 4, "valu": [1, 6, 16, 19], "valuabl": 20, "variabl": [1, 8], "varianc": 15, "variant": 12, "varieti": 3, "variou": [1, 3, 4], "vast": 4, "ve": 3, "veri": [4, 6, 16, 19, 20], "verifi": 4, "version": 22, "vertexai": 10, "via": [6, 18], "vicuna_v1": 6, "video": [8, 17, 22], "view": [1, 3], "vision": [1, 4, 11, 20, 23], "visit": [0, 3, 23], "visitor": 3, "vl": [6, 20], "vl2": 6, "w": [4, 10, 19], "wa": [3, 4], "wai": 20, "wait": [2, 4, 5, 6, 18, 23], "wait_for_serv": [2, 4, 5, 6, 23], "waiting_request_latency_second": 18, "waiting_request_latency_seconds_bucket": 18, "waiting_request_latency_seconds_count": 18, "waiting_request_latency_seconds_sum": 18, "wand": 10, "want": [1, 3, 19], "warm": 3, "warn": 16, "washington": [3, 4], "watchdog_timeout": [2, 4, 5, 6, 23], "water": 3, "we": [1, 2, 3, 4, 5, 6, 15, 23], "weather": 3, "web": 4, "weekend": 3, "weight": [1, 3, 4, 5, 6, 8, 12, 23], "weight_util": [2, 3, 4, 5, 6, 23], "weights_onli": 6, "welcom": 14, "well": 20, "were": 4, "what": [2, 4, 6, 9, 10, 23], "when": [2, 4, 10, 14, 15, 16, 19], "where": [3, 6, 9], "whether": [2, 19], "which": [3, 4, 6, 14, 15, 16, 19, 21, 23], "while": [2, 3, 4, 5, 6, 8, 12, 15, 19, 22, 23], "white": [3, 6], "whl": 22, "who": [3, 4, 23], "why": 4, "wide": [4, 11], "wider": 3, "wine": 3, "within": 10, "without": [2, 3, 4, 12, 22], "won": 3, "wood": 10, "word": [4, 10], "work": [1, 3, 8, 14, 16], "workflow": 10, "workload": [1, 16], "workplac": 3, "world": 3, "would": 4, "wrinkl": 3, "write": [0, 4], "writer": 3, "x64": [6, 8], "x86_64": 12, "xai": 3, "xvers": 20, "xxx": 8, "y": [8, 12], "yaml": [18, 22], "year": 3, "yellow": 6, "yi": 20, "yml": 22, "you": [0, 1, 2, 3, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "your": [0, 1, 3, 4, 6, 10, 11, 22, 23], "zip": [1, 3]}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Native APIs", "Offline Engine API", "OpenAI APIs - Completions", "OpenAI APIs - Embedding", "OpenAI APIs - Vision", "PyPI Package Release Process", "Set Up Self-Hosted Runners for GitHub Action", "Choices Methods in SGLang", "Frontend: Structured Generation Language (SGLang)", "SGLang Documentation", "Benchmark and Profiling", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Frequently Asked Questions", "Guide on Hyperparameter Tuning", "Learn more", "Production Metrics", "Sampling Parameters in SGLang Runtime", "Supported Models", "Troubleshooting", "Install SGLang", "Quick Start: Sending Requests"], "titleterms": {"0": 15, "1": [1, 8, 22], "2": [8, 22], "3": [1, 8, 22], "4": 22, "405b": 1, "5": 22, "A": [2, 4, 5, 6, 23], "The": 15, "With": 22, "access": 21, "achiev": 16, "action": 8, "add": [8, 13, 20], "addit": 1, "advanc": 16, "an": 21, "api": [1, 2, 3, 4, 5, 6, 23], "ar": 15, "arg": 2, "argument": 1, "ask": 15, "asynchron": 3, "avoid": 16, "backend": [1, 11], "batch": [3, 4, 10], "benchmark": 12, "build": 0, "cach": 2, "chat": [4, 6, 14], "check": 2, "choic": 9, "chunk": 16, "classifi": 2, "clean": 0, "client": [5, 6, 23], "cloud": 22, "code": [7, 13], "common": 22, "compat": 1, "complet": 4, "compos": 22, "config": 8, "configur": 8, "conserv": 16, "constrain": 10, "contain": 8, "contributor": 13, "control": 10, "correct": 20, "cuda": 21, "curl": [5, 6, 23], "custom": 14, "dashboard": 18, "debug": 20, "decod": [4, 10, 19], "depend": 0, "deploi": 0, "detail": 10, "determinist": 15, "docker": [8, 22], "document": [0, 11], "dp": 16, "embed": [2, 5, 20], "encod": 2, "encount": 21, "engin": [1, 3], "error": 21, "even": 15, "exampl": [1, 10, 19], "express": 4, "featur": 10, "flow": 10, "flush": 2, "format": [13, 14], "fraction": 16, "frequent": 15, "from": [1, 20, 22], "frontend": [10, 11], "gener": [2, 3, 10, 20, 23], "get": [2, 11], "github": [7, 8], "grafana": 18, "greedi": 9, "guid": [13, 16, 18], "health": 2, "host": 8, "how": 20, "http": 1, "hyperparamet": 16, "id": 5, "illeg": 21, "imag": 6, "implement": 10, "infer": 3, "info": 2, "input": [5, 6], "instal": 22, "interact": 20, "jinja": 14, "json": [4, 10, 14, 19], "kubernet": 22, "languag": 10, "launch": [2, 4, 5, 6, 23], "learn": 17, "length": 9, "likelihood": 9, "llama": 1, "local": 10, "make": 7, "max": 16, "mem": 16, "memori": [2, 16, 21], "method": [9, 22], "metric": 18, "modal": [10, 19], "model": [1, 2, 10, 20], "modelscop": 1, "more": [10, 17], "multi": [10, 19], "multipl": 6, "nativ": [2, 23], "new": 20, "non": 3, "normal": [9, 19], "note": 22, "nsight": 12, "offlin": 3, "openai": [1, 4, 5, 6, 10, 23], "option": 16, "other": 12, "out": [16, 21], "packag": 7, "parallel": 10, "paramet": [4, 19], "peak": 16, "pip": 22, "polici": 16, "pool": 2, "port": 20, "prefil": 16, "preview": 0, "process": 7, "product": 18, "profil": 12, "pypi": 7, "python": [5, 6, 23], "question": 15, "quick": [1, 10, 23], "refer": 11, "regex": [4, 19], "regular": 4, "releas": 7, "request": [5, 6, 16, 23], "result": 15, "reward": [2, 20], "role": 10, "run": [1, 8, 16, 22], "runner": 8, "runtim": [1, 14, 19], "sampl": 19, "schedul": 16, "select": 9, "self": 8, "send": 23, "serv": 0, "server": [1, 2, 4, 5, 6, 23], "set": 8, "setup": 18, "sglang": [0, 1, 9, 10, 11, 14, 19, 20, 22], "sh": 8, "size": [2, 16], "skypilot": 22, "sourc": 22, "speed": 16, "srt": 1, "start": [1, 8, 10, 11, 23], "static": 16, "step": 8, "stream": [3, 10, 19, 23], "structur": [4, 10, 19], "submiss": 16, "suit": 20, "support": 20, "synchron": 3, "temperatur": 15, "templat": [6, 14], "test": [13, 20], "text": 2, "throughput": 16, "tip": [10, 12], "token": 9, "tp": 16, "troubleshoot": 21, "try": 16, "tune": 16, "tutori": 11, "uncondit": 9, "unit": 13, "up": 8, "updat": [2, 7], "upload": 7, "us": [1, 5, 6, 10, 22, 23], "usag": 4, "version": 7, "vision": 6, "vllm": 20, "wa": 21, "websit": 0, "weight": 2, "without": 1, "your": [13, 16]}})