diff --git a/.github/workflows/build-site.yaml b/.github/workflows/build-site.yaml index f9e96a75..76a37414 100644 --- a/.github/workflows/build-site.yaml +++ b/.github/workflows/build-site.yaml @@ -16,6 +16,7 @@ jobs: - name: Configuring build Environment run: | sudo apt-get update + python -m pip install -U pip - name: Setup Ruby uses: ruby/setup-ruby@v1 @@ -24,13 +25,10 @@ jobs: - name: Installing dependencies run: | + python -m pip install -r docs/requirements.txt gem install jekyll jekyll-remote-theme jekyll-sass-converter - - name: Build site - run: | - cd site && jekyll b && cd .. - - - name: Push to gh-pages branch + - name: Build and deploy site if: github.ref == 'refs/heads/main' run: | git remote set-url origin https://x-access-token:${{ secrets.MLC_GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY diff --git a/README.md b/README.md index f2c8afe3..5a31f245 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **High-Performance In-Browser LLM Inference Engine.** -[Get Started](#get-started) | [Blogpost](https://blog.mlc.ai/2024/06/13/webllm-a-high-performance-in-browser-llm-inference-engine) | [Examples](examples) | [Documentation](https://mlc.ai/mlc-llm/docs/deploy/webllm.html) +[Documentation](https://webllm.mlc.ai/docs/) | [Blogpost](https://blog.mlc.ai/2024/06/13/webllm-a-high-performance-in-browser-llm-inference-engine) | [Paper](https://arxiv.org/abs/2412.15803) | [Examples](examples) @@ -374,7 +374,7 @@ npm install npm run build ``` -Then, to test the effects of your code change in an example, inside `examples/get-started/package.json`, change from `"@mlc-ai/web-llm": "^0.2.77"` to `"@mlc-ai/web-llm": ../..`. +Then, to test the effects of your code change in an example, inside `examples/get-started/package.json`, change from `"@mlc-ai/web-llm": "^0.2.78"` to `"@mlc-ai/web-llm": ../..`. Then run: @@ -455,6 +455,21 @@ This project is initiated by members from CMU Catalyst, UW SAMPL, SJTU, OctoML, This project is only possible thanks to the shoulders open-source ecosystems that we stand on. We want to thank the Apache TVM community and developers of the TVM Unity effort. The open-source ML community members made these models publicly available. PyTorch and Hugging Face communities make these models accessible. We would like to thank the teams behind Vicuna, SentencePiece, LLaMA, and Alpaca. We also would like to thank the WebAssembly, Emscripten, and WebGPU communities. Finally, thanks to Dawn and WebGPU developers. +## Citation +If you find this project to be useful, please cite: + +``` +@misc{ruan2024webllmhighperformanceinbrowserllm, + title={WebLLM: A High-Performance In-Browser LLM Inference Engine}, + author={Charlie F. Ruan and Yucheng Qin and Xun Zhou and Ruihang Lai and Hongyi Jin and Yixin Dong and Bohan Hou and Meng-Shiun Yu and Yiyan Zhai and Sudeep Agarwal and Hangrui Cao and Siyuan Feng and Tianqi Chen}, + year={2024}, + eprint={2412.15803}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2412.15803}, +} +``` + ## Contributors diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..3449de1e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= python -m sphinx +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..41cbd8d2 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ +# WebLLM Documentation + +The documentation was built upon [Sphinx](https://www.sphinx-doc.org/en/master/). + +## Dependencies + +Run the following command in this directory to install dependencies first: + +```bash +pip3 install -r requirements.txt +``` + +## Build the Documentation + +Then you can build the documentation by running: + +```bash +make html +``` + +## View the Documentation + +Run the following command to start a simple HTTP server: + +```bash +cd _build/html +python3 -m http.server +``` + +Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending ` -p PORT_NUMBER` in the python command above). diff --git a/docs/_static/img/mlc-logo-with-text-landscape.svg b/docs/_static/img/mlc-logo-with-text-landscape.svg new file mode 100644 index 00000000..e122d32f --- /dev/null +++ b/docs/_static/img/mlc-logo-with-text-landscape.svg @@ -0,0 +1,87 @@ + +image/svg+xml + + + + + + + + + + + + + + + + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..c3ea8c7f --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +import os +import sys + +import tlcpack_sphinx_addon + +# -- General configuration ------------------------------------------------ + +sys.path.insert(0, os.path.abspath("../python")) +sys.path.insert(0, os.path.abspath("../")) +autodoc_mock_imports = ["torch"] + +# General information about the project. +project = "web-llm" +author = "WebLLM Contributors" +copyright = "2023, %s" % author + +# Version information. + +version = "0.2.78" +release = "0.2.78" + +extensions = [ + "sphinx_tabs.tabs", + "sphinx_toolbox.collapse", + "sphinxcontrib.httpdomain", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_reredirects", +] + +redirects = {"get_started/try_out": "../index.html#getting-started"} + +source_suffix = [".rst"] + +language = "en" + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme is set by the make target +import sphinx_rtd_theme + +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +templates_path = [] + +html_static_path = [] + +footer_copyright = "© 2023 MLC LLM" +footer_note = " " + +html_logo = "_static/img/mlc-logo-with-text-landscape.svg" + +html_theme_options = { + "logo_only": True, +} + +header_links = [ + ("Home", "https://webllm.mlc.ai/"), + ("GitHub", "https://github.com/mlc-ai/web-llm"), + ("Discord", "https://discord.gg/9Xpy2HGBuD"), +] + +header_dropdown = { + "name": "Other Resources", + "items": [ + ("WebLLM Chat", "https://chat.webllm.ai/"), + ("MLC Course", "https://mlc.ai/"), + ("MLC Blog", "https://blog.mlc.ai/"), + ("MLC LLM", "https://llm.mlc.ai/"), + ], +} + +html_context = { + "footer_copyright": footer_copyright, + "footer_note": footer_note, + "header_links": header_links, + "header_dropdown": header_dropdown, + "display_github": True, + "github_user": "mlc-ai", + "github_repo": "web-llm", + "github_version": "main/docs/", + "theme_vcs_pageview_mode": "edit", + # "header_logo": "/path/to/logo", + # "header_logo_link": "", + # "version_selecter": "", +} + + +# add additional overrides +templates_path += [tlcpack_sphinx_addon.get_templates_path()] +html_static_path += [tlcpack_sphinx_addon.get_static_path()] diff --git a/docs/developer/add_models.rst b/docs/developer/add_models.rst new file mode 100644 index 00000000..0a4803d8 --- /dev/null +++ b/docs/developer/add_models.rst @@ -0,0 +1,6 @@ +Adding Models +============= + +WebLLM allows you to compile custom language models using `MLC LLM `_ and then serve the compiled model through WebLLM. + +For instructions on how to compile and add custom models to WebLLM, check the `MLC LLM documentation here `_. \ No newline at end of file diff --git a/docs/developer/building_from_source.rst b/docs/developer/building_from_source.rst new file mode 100644 index 00000000..e4508f90 --- /dev/null +++ b/docs/developer/building_from_source.rst @@ -0,0 +1,35 @@ +Building From Source +==================== + +Clone the Repository +--------------------- +.. code-block:: bash + + git clone https://github.com/mlc-ai/web-llm.git + cd web-llm + +Install Dependencies +--------------------- +.. code-block:: bash + + npm install + +Build the Project +----------------- +.. code-block:: bash + + npm run build + +Test Changes +------------ + +To test your changes, you can reuse any existing example or create a new example to test your new functionality. + +Then, to test the effects of your code change in an example, inside ``examples//package.json``, change from ``"@mlc-ai/web-llm": "^0.2.xx"`` to ``"@mlc-ai/web-llm": ../..`` to let it reference your local code. + +.. code-block:: bash + + cd examples/ + # Modify the package.json + npm install + npm start diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..28b0ed70 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,35 @@ +👋 Welcome to WebLLM +==================== + +`GitHub `_ | `WebLLM Chat `_ | `NPM `_ | `Discord `_ + +WebLLM is a high-performance in-browser language model inference engine that brings large language models (LLMs) to web browsers with hardware acceleration. With WebGPU support, it allows developers to build AI-powered applications directly within the browser environment, removing the need for server-side processing and ensuring privacy.
+ +It provides a specialized runtime for the web backend of MLCEngine, leverages +`WebGPU `_ for local acceleration, offers OpenAI-compatible API, +and provides built-in support for web workers to separate heavy computation from the UI flow. + +Key Features +------------ +- 🌐 In-Browser Inference: Run LLMs directly in the browser +- 🚀 WebGPU Acceleration: Leverage hardware acceleration for optimal performance +- 🔄 OpenAI API Compatibility: Seamless integration with standard AI workflows +- 📦 Multiple Model Support: Works with Llama, Phi, Gemma, Mistral, and more + +Start exploring WebLLM by `chatting with WebLLM Chat `_, and start building webapps with high-performance local LLM inference with the following guides and tutorials. + +.. toctree:: + :maxdepth: 2 + :caption: User Guide + + user/get_started.rst + user/basic_usage.rst + user/advanced_usage.rst + user/api_reference.rst + +.. toctree:: + :maxdepth: 2 + :caption: Developer Guide + + developer/building_from_source.rst + developer/add_models.rst diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..954237b9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..2658857d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx-tabs == 3.4.1 +sphinx-rtd-theme +sphinx == 5.2.3 +sphinx-toolbox == 3.4.0 +tlcpack-sphinx-addon==0.2.2 +sphinxcontrib_httpdomain==1.8.1 +sphinxcontrib-napoleon==0.7 +sphinx-reredirects==0.1.2 diff --git a/docs/user/advanced_usage.rst b/docs/user/advanced_usage.rst new file mode 100644 index 00000000..f5cb034c --- /dev/null +++ b/docs/user/advanced_usage.rst @@ -0,0 +1,133 @@ +Advanced Use Cases +================== + +Using Workers +------------- + +You can put the heavy computation in a worker script to optimize your application performance. To do so, you need to: + +Create a handler in the worker thread that communicates with the frontend while handling the requests. +Create a Worker Engine in your main application, which under the hood sends messages to the handler in the worker thread. +For detailed implementations of different kinds of Workers, check the following sections. + +Using Web Workers +^^^^^^^^^^^^^^^^^ +WebLLM comes with API support for `Web Workers `_ so you can offload the computation-heavy generation work into a separate worker thread. WebLLM has implemented the cross-thread communication through messages under the hood so you don't need to manually implement it any more. + +In the worker script, import and instantiate ``WebWorkerMLCEngineHandler``, which handles the communications with other scripts and processes incoming requests. + +.. 
code-block:: typescript + + // worker.ts + import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm"; + + const handler = new WebWorkerMLCEngineHandler(); + self.onmessage = (msg: MessageEvent) => { + handler.onmessage(msg); + }; + +In the main script, import and instantiate a ``WebWorkerMLCEngine`` that implements the same ``MLCEngineInterface`` and exposes the same APIs, then simply use it as you would a normal ``MLCEngine`` in your application. + +.. code-block:: typescript + + import { CreateWebWorkerMLCEngine } from "@mlc-ai/web-llm"; + + async function runWorker() { + const engine = await CreateWebWorkerMLCEngine( + new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }), + "Llama-3.1-8B-Instruct" + ); + + const messages = [{ role: "user", content: "How does WebLLM use workers?" }]; + const reply = await engine.chat.completions.create({ messages }); + console.log(reply.choices[0].message.content); + } + + runWorker(); + + +Under the hood, ``WebWorkerMLCEngine`` does **not** actually do any computation; instead, it serves as a proxy that translates all calls into messages and sends them to the ``WebWorkerMLCEngineHandler`` to process. The worker thread receives these messages, performs the actual computation using a hidden engine, and returns the result to the main thread through messages. + +Service Workers +^^^^^^^^^^^^^^^ +WebLLM also supports offloading the computation to `Service Workers `_ to avoid reloading the model between page refreshes and to optimize your application's offline experience. + +(Note: a Service Worker's life cycle is managed by the browser and it can be killed at any time without notifying the webapp. WebLLM's ``ServiceWorkerMLCEngine`` will try to keep the service worker thread alive by periodically sending heartbeat events, but the script could still be killed by Chrome at any time, so your application should include proper error handling. Check `keepAliveMs` and `missedHeartbeat` in `ServiceWorkerMLCEngine `_ for more details.) + +In the worker script, import and instantiate ``ServiceWorkerMLCEngineHandler``, which handles the communications with page scripts and processes incoming requests. + +.. code-block:: typescript + + // sw.ts + import { ServiceWorkerMLCEngineHandler } from "@mlc-ai/web-llm"; + + self.addEventListener("activate", () => { + const handler = new ServiceWorkerMLCEngineHandler(); + console.log("Service Worker activated!"); + }); + + +Then in the main page script, register the service worker and instantiate the engine using the ``CreateServiceWorkerMLCEngine`` factory function. The engine implements the same ``MLCEngineInterface`` and exposes the same APIs; simply use it as you would a normal ``MLCEngine`` in your application. + +.. code-block:: typescript + + // main.ts + import { MLCEngineInterface, CreateServiceWorkerMLCEngine } from "@mlc-ai/web-llm"; + + if ("serviceWorker" in navigator) { + navigator.serviceWorker.register( + new URL("sw.ts", import.meta.url), // worker script + { type: "module" }, + ); + } + + // selectedModel and initProgressCallback are defined elsewhere in your app + const engine: MLCEngineInterface = + await CreateServiceWorkerMLCEngine( + selectedModel, + { initProgressCallback }, // engineConfig + ); + +Similar to the ``WebWorkerMLCEngine`` above, the ``ServiceWorkerMLCEngine`` is also a proxy and does not do any actual computation. Instead, it sends all calls to the service worker thread to handle and receives the results back through messages. + +Chrome Extension +---------------- + +WebLLM can be used in Chrome extensions to empower local LLM inference.
You can find examples of building Chrome extensions using WebLLM in `examples/chrome-extension `_ and `examples/chrome-extension-webgpu-service-worker `_. The latter leverages a service worker, so the extension is persistent in the background. + +Additionally, we have a full Chrome extension project, `WebLLM Assistant `_, which leverages WebLLM to provide a personal web-browsing copilot experience. Feel free to check it out and contribute if you are interested. + + +Other Customization +------------------- + +Using IndexedDB Cache +^^^^^^^^^^^^^^^^^^^^^ + +Set `appConfig` in `MLCEngineConfig` to enable caching for faster subsequent model loads. + +.. code-block:: typescript + + const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", { + appConfig: { + useIndexedDB: true, + models: [ + { model_id: "Llama-3.1-8B", model_path: "/models/llama3" }, + ], + }, + }); + +Customizing Token Behavior +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Modify `logit_bias` in `GenerationConfig` to influence token likelihood: + +.. code-block:: typescript + + const messages = [ + { role: "user", content: "Describe WebLLM in detail." }, + ]; + + const response = await engine.chatCompletion({ + messages, + logit_bias: { "50256": -100 }, // Example: Prevent specific token generation + }); diff --git a/docs/user/api_reference.rst b/docs/user/api_reference.rst new file mode 100644 index 00000000..0a28cf79 --- /dev/null +++ b/docs/user/api_reference.rst @@ -0,0 +1,202 @@ +.. _api-reference: + +WebLLM API Reference +==================== + +The ``MLCEngine`` class is the core interface of WebLLM. It enables model loading, chat completions, embeddings, and other operations. Below, we document its methods, along with the associated configuration interfaces. + +Interfaces +---------- + +The following interfaces are used as parameters or configurations within ``MLCEngine`` methods. They are linked to their respective methods for reference. + +MLCEngineConfig +^^^^^^^^^^^^^^^ + +Optional configurations for ``CreateMLCEngine()`` and ``CreateWebWorkerMLCEngine()``. + + +- **Fields**: + - ``appConfig``: Configure the app, including the list of models and whether to use IndexedDB cache. + - ``initProgressCallback``: A callback for showing the progress of loading the model. + - ``logitProcessorRegistry``: A registry for stateful logit processors, see ``webllm.LogitProcessor``. + + +- **Usage**: + - ``appConfig``: Contains application-specific settings, including: + - Model configurations. + - IndexedDB caching preferences. + - ``initProgressCallback``: Allows developers to visualize model loading progress by implementing a callback. + - ``logitProcessorRegistry``: A ``Map`` object for registering custom logit processors. Only applies to ``MLCEngine``. + + +.. note:: All fields are optional, and ``logitProcessorRegistry`` is only used for ``MLCEngine``. + + +Example: + +.. code-block:: typescript + + const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", { + appConfig: { /* app-specific config */ }, + initProgressCallback: (progress) => console.log(progress), + }); + + +GenerationConfig +^^^^^^^^^^^^^^^^ + +Configurations for a single generation task, primarily used in chat completions. + +- **Fields**: + - ``repetition_penalty``, ``ignore_eos``: Specific to MLC models. + - ``top_p``, ``temperature``, ``max_tokens``, ``stop``: Common with OpenAI APIs. + - ``logit_bias``, ``n``: Additional parameters for sampling control.
+ +- **Usage**: + - Fields like ``repetition_penalty`` and ``ignore_eos`` allow fine control over the output generation behavior. + - Common parameters shared with OpenAI APIs (e.g., ``temperature``, ``top_p``) ensure compatibility. + + +Example: + +.. code-block:: typescript + + const messages = [ + { role: "system", content: "You are a helpful assistant." }, + { role: "user", content: "Explain WebLLM." }, + ]; + + const response = await engine.chatCompletion({ + messages, + top_p: 0.9, + temperature: 0.8, + max_tokens: 150, + }); + +ChatCompletionRequest +^^^^^^^^^^^^^^^^^^^^^ + +Defines the structure for chat completion requests. + +- **Base Interface**: ``ChatCompletionRequestBase`` + - Contains parameters like ``messages``, ``stream``, ``frequency_penalty``, and ``presence_penalty``. +- **Variants**: + - ``ChatCompletionRequestNonStreaming``: For non-streaming completions. + - ``ChatCompletionRequestStreaming``: For streaming completions. + +- **Usage**: + - Combines settings from ``GenerationConfig`` and ``ChatCompletionRequestBase`` to provide complete control over chat behavior. + - The ``stream`` parameter enables dynamic streaming responses, improving interactivity in conversational agents. + - The ``logit_bias`` feature allows fine-tuning of token generation probabilities, providing a mechanism to restrict or encourage specific outputs. + + +Example: + +.. code-block:: typescript + + const response = await engine.chatCompletion({ + messages: [ + { role: "user", content: "Tell me about WebLLM." }, + ], + stream: true, + }); + +Model Loading +------------- + +``MLCEngine.reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise`` + +Loads the specified model(s) into the engine. Uses ``MLCEngineConfig`` during initialization. + +- Parameters: + - ``modelId``: Identifier(s) for the model(s) to load. + - ``chatOpts``: Configuration for generation (see ``GenerationConfig``). + +Example: + +.. code-block:: typescript + + await engine.reload(["Llama-3.1-8B", "Gemma-2B"], [ + { temperature: 0.7 }, + { top_p: 0.9 }, + ]); + +``MLCEngine.unload(): Promise`` + +Unloads all loaded models and clears their associated configurations. + +Example: + +.. code-block:: typescript + + await engine.unload(); + +--- + +Chat Completions +---------------- + +``MLCEngine.chat.completions.create(request: ChatCompletionRequest): Promise>`` + +Generates chat-based completions using a specified request configuration. + +- Parameters: + - ``request``: A ``ChatCompletionRequest`` instance. + +Example: + +.. code-block:: typescript + + const response = await engine.chat.completions.create({ + messages: [ + { role: "system", content: "You are a helpful AI assistant." }, + { role: "user", content: "What is WebLLM?" }, + ], + temperature: 0.8, + stream: false, + }); + +--- + +Utility Methods +^^^^^^^^^^^^^^^ + +``MLCEngine.getMessage(modelId?: string): Promise`` + +Retrieves the current output message from the specified model. + +``MLCEngine.resetChat(keepStats?: boolean, modelId?: string): Promise`` + +Resets the chat history and optionally retains usage statistics. + +GPU Information +---------------- + +The following methods provide detailed information about the GPU used for WebLLM computations. + +``MLCEngine.getGPUVendor(): Promise`` + +Retrieves the vendor name of the GPU used for computations. Useful for understanding the hardware capabilities during inference. + +- **Returns**: A string indicating the GPU vendor (e.g., "Intel", "NVIDIA"). + +Example: + +.. 
code-block:: typescript + + const gpuVendor = await engine.getGPUVendor(); + console.log(`GPU Vendor: ${gpuVendor}`); + +``MLCEngine.getMaxStorageBufferBindingSize(): Promise`` + +Returns the maximum storage buffer size supported by the GPU. This is important when working with larger models that require significant memory for processing. + +- **Returns**: A number representing the maximum size in bytes. + +Example: + +.. code-block:: typescript + + const maxBufferSize = await engine.getMaxStorageBufferBindingSize(); + console.log(`Max Storage Buffer Binding Size: ${maxBufferSize}`); diff --git a/docs/user/basic_usage.rst b/docs/user/basic_usage.rst new file mode 100644 index 00000000..7f77409f --- /dev/null +++ b/docs/user/basic_usage.rst @@ -0,0 +1,120 @@ +Basic Usage +================ + +Model Records in WebLLM +----------------------- + +Each model available in WebLLM is registered as an instance of +``ModelRecord`` and can be accessed at +`webllm.prebuiltAppConfig.model_list `__. + +Creating an MLCEngine +--------------------- + +WebLLM APIs are exposed through the ``MLCEngine`` interface. You can create an ``MLCEngine`` instance and load the model by calling the ``CreateMLCEngine()`` factory function. + +(Note that loading a model requires downloading its weights, which can take a significant amount of time on the very first run before anything is cached. You should handle this asynchronous call properly.) + +``MLCEngine`` can be instantiated in two ways: +1. Using the factory function ``CreateMLCEngine``. +2. Instantiating the ``MLCEngine`` class directly and using ``reload()`` to load models. + +.. code-block:: typescript + + import { CreateMLCEngine, MLCEngine } from "@mlc-ai/web-llm"; + + // Initialize with a progress callback + const initProgressCallback = (progress) => { + console.log("Model loading progress:", progress); + }; + + // Using CreateMLCEngine + const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", { initProgressCallback }); + + // Direct instantiation + const engineInstance = new MLCEngine({ initProgressCallback }); + await engineInstance.reload("Llama-3.1-8B-Instruct"); + +Under the hood, the ``CreateMLCEngine`` factory function performs the following steps: it first creates an engine instance (synchronous) and then loads the model (asynchronous). You can also do them separately in your application. + +.. code-block:: typescript + + import { MLCEngine } from "@mlc-ai/web-llm"; + + // This is a synchronous call that returns immediately + const engine = new MLCEngine({ + initProgressCallback: initProgressCallback + }); + + // This is an asynchronous call and can take a long time to finish + await engine.reload(selectedModel); + + +Chat Completion +--------------- + +Chat completions can be invoked using OpenAI-style chat APIs through the ``engine.chat.completions`` interface of an initialized ``MLCEngine``. Check :ref:`api-reference` for the full list of parameters and their descriptions. + +(Note: as the model is determined at ``MLCEngine`` initialization time, the ``model`` parameter is not supported and will be **ignored**. Instead, call ``CreateMLCEngine(model)`` or ``engine.reload(model)`` to reinitialize the engine to use a specific model.) + +.. code-block:: typescript + + const messages = [ + { role: "system", content: "You are a helpful AI assistant." }, + { role: "user", content: "Hello!"
} + ]; + + const reply = await engine.chat.completions.create({ + messages, + }); + + console.log(reply.choices[0].message); + console.log(reply.usage); + + +Streaming Chat Completion +------------------------- + +Streaming chat completion can be enabled by passing the ``stream: true`` parameter to the `engine.chat.completions.create` call configuration. Check :ref:`api-reference` for the full list of parameters. + +.. code-block:: typescript + + const messages = [ + { role: "system", content: "You are a helpful AI assistant." }, + { role: "user", content: "Hello!" }, + ] + + // chunks is an AsyncGenerator object + const chunks = await engine.chat.completions.create({ + messages, + temperature: 1, + stream: true, // <-- Enable streaming + stream_options: { include_usage: true }, + }); + + let reply = ""; + for await (const chunk of chunks) { + reply += chunk.choices[0]?.delta.content || ""; + console.log(reply); + if (chunk.usage) { + console.log(chunk.usage); // only the last chunk has usage + } + } + + const fullReply = await engine.getMessage(); + console.log(fullReply); + + +Chatbot Examples +---------------- + +Learn how to use WebLLM to integrate large language models into your applications and generate chat completions through these simple chatbot examples: + +- `Example in JSFiddle `_ +- `Example in CodePen `_ + +For an advanced example of a larger, more complicated project, check `WebLLM Chat `_. + +More examples for different use cases are available in the examples folder. + + diff --git a/docs/user/get_started.rst b/docs/user/get_started.rst new file mode 100644 index 00000000..1c8613c3 --- /dev/null +++ b/docs/user/get_started.rst @@ -0,0 +1,75 @@ +Getting Started with WebLLM +=========================== + +This guide will help you set up WebLLM in your project, install the necessary dependencies, and verify your setup. + + +WebLLM Chat +----------- + +If you want to experience AI chat supported by local LLM inference and understand how WebLLM works, try out `WebLLM Chat `__, which provides a great example +of integrating WebLLM into a full web application. + +A WebGPU-compatible browser is needed to run WebLLM-powered web applications. +You can download the latest Google Chrome and use `WebGPU Report `__ +to verify the functionality of WebGPU on your browser. + +Installation +------------ + +WebLLM offers a minimalist and modular interface to access the chatbot in the browser. The package is designed in a modular way to hook into any UI components. + +WebLLM is available as an `npm package `_ and is also CDN-delivered. Therefore, you can install WebLLM using Node.js package managers like npm, yarn, or pnpm, or import the package directly via CDN. + +Using Package Managers +^^^^^^^^^^^^^^^^^^^^^^ +Install WebLLM via your preferred package manager: + +.. code-block:: bash + + # npm + npm install @mlc-ai/web-llm + # yarn + yarn add @mlc-ai/web-llm + # pnpm + pnpm install @mlc-ai/web-llm + +Import WebLLM into your project: + +.. code-block:: javascript + + // Import everything + import * as webllm from "@mlc-ai/web-llm"; + + // Or only import what you need + import { CreateMLCEngine } from "@mlc-ai/web-llm"; + +Using CDN +^^^^^^^^^ +Thanks to `jsdelivr.com `_, WebLLM can be imported directly through a URL and works out of the box on cloud development platforms like `jsfiddle.net `_, `Codepen.io `_, and `Scribbler `_: + +..
code-block:: javascript + + import * as webllm from "https://esm.run/@mlc-ai/web-llm"; + +This method is especially useful for online environments like CodePen, JSFiddle, or local experiments. + +Verifying Installation +^^^^^^^^^^^^^^^^^^^^^^ +Run the following script to verify the installation: + +.. code-block:: javascript + + import { CreateMLCEngine } from "@mlc-ai/web-llm"; + console.log("WebLLM loaded successfully!"); + + +Online IDE Sandbox +------------------ + +Instead of setting WebLLM locally, you can also try it on online Javascript IDE sandboxes like: + +- `Example in JSFiddle `_ +- `Example in CodePen `_ + + diff --git a/examples/abort-reload/package.json b/examples/abort-reload/package.json index 54de6369..3e947766 100644 --- a/examples/abort-reload/package.json +++ b/examples/abort-reload/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/cache-usage/package.json b/examples/cache-usage/package.json index 703582f4..eaa19c26 100644 --- a/examples/cache-usage/package.json +++ b/examples/cache-usage/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/chrome-extension-webgpu-service-worker/package.json b/examples/chrome-extension-webgpu-service-worker/package.json index faed25a7..d6be1496 100644 --- a/examples/chrome-extension-webgpu-service-worker/package.json +++ b/examples/chrome-extension-webgpu-service-worker/package.json @@ -17,7 +17,7 @@ "url": "^0.11.1" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77", + "@mlc-ai/web-llm": "^0.2.78", "progressbar.js": "^1.1.0" } } diff --git a/examples/chrome-extension/package.json b/examples/chrome-extension/package.json index 73cf43ff..dea700dc 100644 --- a/examples/chrome-extension/package.json +++ b/examples/chrome-extension/package.json @@ -17,7 +17,7 @@ "url": "^0.11.1" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77", + "@mlc-ai/web-llm": "^0.2.78", "progressbar.js": "^1.1.0" } } diff --git a/examples/embeddings/package.json b/examples/embeddings/package.json index 096d3ff2..f7039c4d 100644 --- a/examples/embeddings/package.json +++ b/examples/embeddings/package.json @@ -15,7 +15,7 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77", + "@mlc-ai/web-llm": "^0.2.78", "langchain": "0.2.15" } } diff --git a/examples/function-calling/function-calling-manual/package.json b/examples/function-calling/function-calling-manual/package.json index ee759ed6..4033256f 100644 --- a/examples/function-calling/function-calling-manual/package.json +++ b/examples/function-calling/function-calling-manual/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/function-calling/function-calling-openai/package.json b/examples/function-calling/function-calling-openai/package.json index 78574f2f..9003b556 100644 --- a/examples/function-calling/function-calling-openai/package.json +++ b/examples/function-calling/function-calling-openai/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/get-started-web-worker/package.json b/examples/get-started-web-worker/package.json index 263994e4..39b98676 100644 --- a/examples/get-started-web-worker/package.json +++ b/examples/get-started-web-worker/package.json @@ -15,6 +15,6 @@ "url": 
"^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/get-started/package.json b/examples/get-started/package.json index 6117cfb3..107d989e 100644 --- a/examples/get-started/package.json +++ b/examples/get-started/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/json-mode/package.json b/examples/json-mode/package.json index 57739057..f78ebcb2 100644 --- a/examples/json-mode/package.json +++ b/examples/json-mode/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/json-schema/package.json b/examples/json-schema/package.json index b4a27c59..7a8e76e1 100644 --- a/examples/json-schema/package.json +++ b/examples/json-schema/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/logit-processor/package.json b/examples/logit-processor/package.json index c9804782..0662a5c0 100644 --- a/examples/logit-processor/package.json +++ b/examples/logit-processor/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/multi-models/package.json b/examples/multi-models/package.json index af02919b..94921832 100644 --- a/examples/multi-models/package.json +++ b/examples/multi-models/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/multi-round-chat/package.json b/examples/multi-round-chat/package.json index b811da51..15fdad7a 100644 --- a/examples/multi-round-chat/package.json +++ b/examples/multi-round-chat/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/next-simple-chat/package.json b/examples/next-simple-chat/package.json index f32913cd..50cd940b 100644 --- a/examples/next-simple-chat/package.json +++ b/examples/next-simple-chat/package.json @@ -9,7 +9,7 @@ "lint": "next lint" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77", + "@mlc-ai/web-llm": "^0.2.78", "@types/node": "20.3.3", "@types/react": "18.2.14", "@types/react-dom": "18.2.6", diff --git a/examples/seed-to-reproduce/package.json b/examples/seed-to-reproduce/package.json index f8f2b8bf..4f48d59e 100644 --- a/examples/seed-to-reproduce/package.json +++ b/examples/seed-to-reproduce/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/service-worker/package.json b/examples/service-worker/package.json index 2e5cd6ff..3f0c6edc 100644 --- a/examples/service-worker/package.json +++ b/examples/service-worker/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/simple-chat-ts/package.json b/examples/simple-chat-ts/package.json index 86ad2801..9d529e4e 100644 --- a/examples/simple-chat-ts/package.json +++ b/examples/simple-chat-ts/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/streaming/package.json b/examples/streaming/package.json index 5618c5fa..37f1ce3b 
100644 --- a/examples/streaming/package.json +++ b/examples/streaming/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/text-completion/package.json b/examples/text-completion/package.json index 88c36976..e10d8b3a 100644 --- a/examples/text-completion/package.json +++ b/examples/text-completion/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/examples/vision-model/package.json b/examples/vision-model/package.json index e6ba8888..d1e0bfc5 100644 --- a/examples/vision-model/package.json +++ b/examples/vision-model/package.json @@ -15,6 +15,6 @@ "url": "^0.11.3" }, "dependencies": { - "@mlc-ai/web-llm": "^0.2.77" + "@mlc-ai/web-llm": "^0.2.78" } } diff --git a/package-lock.json b/package-lock.json index 419724ef..12e86a22 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mlc-ai/web-llm", - "version": "0.2.77", + "version": "0.2.78", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mlc-ai/web-llm", - "version": "0.2.77", + "version": "0.2.78", "license": "Apache-2.0", "dependencies": { "loglevel": "^1.9.1" diff --git a/package.json b/package.json index f197aab1..a1e2909b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@mlc-ai/web-llm", - "version": "0.2.77", + "version": "0.2.78", "description": "Hardware accelerated language model chats on browsers", "main": "lib/index.js", "types": "lib/index.d.ts", diff --git a/scripts/gh_deploy_site.sh b/scripts/gh_deploy_site.sh index ab6faf55..0baf098d 100755 --- a/scripts/gh_deploy_site.sh +++ b/scripts/gh_deploy_site.sh @@ -1,7 +1,11 @@ #!/bin/bash set -euxo pipefail +export PYTHONPATH=$PWD/python +cd docs && make html && cd .. cd site && jekyll b && cd .. +rm -rf site/_site/docs +cp -r docs/_build/html site/_site/docs git fetch git checkout -B gh-pages origin/gh-pages diff --git a/site/_includes/hero.html b/site/_includes/hero.html index a2c3c314..404bc1d7 100644 --- a/site/_includes/hero.html +++ b/site/_includes/hero.html @@ -2,7 +2,7 @@

WebLLM: High-Performance In-Browser LLM Inference Engine