diff --git a/.buildinfo b/.buildinfo new file mode 100644 index 000000000..a8e20cecc --- /dev/null +++ b/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: b5100acc99d117426e20a5fd4036dd8b +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/_modules/index.html b/_modules/index.html new file mode 100644 index 000000000..54eaddadf --- /dev/null +++ b/_modules/index.html @@ -0,0 +1,138 @@ + + +
+from dataclasses import dataclass, field
+from typing import List
+
+
+@dataclass
+class CheckpointingInstruction:
+    """
+    Instruction to save and delete checkpoints.
+    """
+
+    save_current: bool = False
+    checkpoints_to_delete: List[int] = field(default_factory=list)
+
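A minimal usage sketch (not part of the module above; the surrounding checkpointing logic and the step ids are hypothetical):

from modalities.checkpointing.checkpointing_instruction import CheckpointingInstruction

# Hypothetical example values: save the current checkpoint and delete two older ones.
instruction = CheckpointingInstruction(save_current=True, checkpoints_to_delete=[100, 200])

if instruction.save_current:
    ...  # persist the current model/optimizer state here
for checkpoint_id in instruction.checkpoints_to_delete:
    ...  # remove the checkpoint identified by checkpoint_id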
+
+from enum import Enum
+
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from typing import Dict, List
+
+from modalities.logging_broker.messages import Message, MessageTypes
+from modalities.logging_broker.subscriber import MessageSubscriberIF
+
+
+class MessageBrokerIF(ABC):
+    """Interface for message broker objects."""
+
+    @abstractmethod
+    def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF):
+        raise NotImplementedError
+
+    @abstractmethod
+    def distribute_message(self, message: Message):
+        raise NotImplementedError
+
+
+class MessageBroker(MessageBrokerIF):
+    """The MessageBroker sends notifications to its subscribers."""
+
+    def __init__(self) -> None:
+        self.subscriptions: Dict[MessageTypes, List[MessageSubscriberIF]] = defaultdict(list)
+
+    def add_subscriber(self, subscription: MessageTypes, subscriber: MessageSubscriberIF):
+        """Adds a single subscriber."""
+        self.subscriptions[subscription].append(subscriber)
+
+    def distribute_message(self, message: Message):
+        """Distributes message to all subscribers."""
+        message_type = message.message_type
+        for subscriber in self.subscriptions[message_type]:
+            subscriber.consume_message(message=message)
+
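A minimal usage sketch (not part of the module above). The subscriber class below is a stand-in; it only assumes that subscribers expose a consume_message(message=...) method, which is how MessageBroker.distribute_message calls them:

from modalities.logging_broker.message_broker import MessageBroker
from modalities.logging_broker.messages import Message, MessageTypes


class PrintSubscriber:
    """Stand-in subscriber that simply prints every message it receives."""

    def consume_message(self, message: Message):
        print(f"[rank {message.global_rank}] {message.message_type}: {message.payload}")


broker = MessageBroker()
broker.add_subscriber(subscription=MessageTypes.ERROR_MESSAGE, subscriber=PrintSubscriber())
broker.distribute_message(Message(message_type=MessageTypes.ERROR_MESSAGE, payload="something went wrong"))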
+from dataclasses import dataclass
+from enum import Enum
+from typing import Generic, TypeVar
+
+
+class MessageTypes(Enum):
+    HIGH_LEVEL_PROGRESS_UPDATE = "HIGH_LEVEL_PROGRESS_UPDATE"
+    BATCH_PROGRESS_UPDATE = "PROGRESS_UPDATE"
+    ERROR_MESSAGE = "ERROR_MESSAGE"
+    EVALUATION_RESULT = "EVALUATION_RESULT"
+
+
+T = TypeVar("T")
+
+
+@dataclass
+class Message(Generic[T]):
+    """An object representing a message."""
+
+    message_type: MessageTypes
+    payload: T
+    global_rank: int = 0
+    local_rank: int = 0
+
+
+@dataclass
+class BatchProgressUpdate:
+    """Object holding the state of the current batch computation progress."""
+
+    global_train_sample_id: int  # current sample id in the training dataloader (summed over all ranks)
+    global_dataset_sample_id: int  # current sample id in the respective dataloader (summed over all ranks)
+    # Note: in case of ExperimentStatus.TRAIN, global_dataset_sample_id == global_train_sample_id
+    experiment_status: ExperimentStatus
+    dataloader_tag: str
+
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from modalities.logging_broker.message_broker import Message, MessageBroker
+from modalities.logging_broker.messages import MessageTypes
+
+T = TypeVar("T")
+
+
+class MessagePublisherIF(ABC, Generic[T]):
+    @abstractmethod
+    def publish_message(self, payload: T, message_type: MessageTypes):
+        raise NotImplementedError
+
+
+class MessagePublisher(MessagePublisherIF[T]):
+    """The MessagePublisher sends messages through a message broker."""
+
+    def __init__(
+        self,
+        message_broker: MessageBroker,
+        global_rank: int,
+        local_rank: int,
+    ):
+        self.message_broker = message_broker
+        self.global_rank = global_rank
+        self.local_rank = local_rank
+
+    def publish_message(self, payload: T, message_type: MessageTypes):
+        """Publish a message through the message broker."""
+        message = Message[T](
+            message_type=message_type,
+            global_rank=self.global_rank,
+            local_rank=self.local_rank,
+            payload=payload,
+        )
+        self.message_broker.distribute_message(message)
+
+
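A minimal usage sketch (not part of the module above): wiring a publisher to a broker and publishing an arbitrary example payload:

from modalities.logging_broker.message_broker import MessageBroker
from modalities.logging_broker.messages import MessageTypes
from modalities.logging_broker.publisher import MessagePublisher

broker = MessageBroker()
# Subscribers for MessageTypes.EVALUATION_RESULT would be registered on the broker here.
publisher = MessagePublisher(message_broker=broker, global_rank=0, local_rank=0)
# publish_message wraps the payload into a Message carrying the publisher's ranks
# and hands it to the broker for distribution.
publisher.publish_message(payload={"loss": 2.3}, message_type=MessageTypes.EVALUATION_RESULT)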
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+from modalities.logging_broker.messages import Message
+
+T = TypeVar("T")
+
+
+
+
+
+
diff --git a/api/modalities.checkpointing.html b/api/modalities.checkpointing.html new file mode 100644 index 000000000..fffc05112 --- /dev/null +++ b/api/modalities.checkpointing.html @@ -0,0 +1,214 @@
+MessageBroker (Bases: MessageBrokerIF)
+    The MessageBroker sends notifications to its subscribers.
+    add_subscriber(subscription, subscriber): Adds a single subscriber.
+        Parameters: subscription (MessageTypes), subscriber (MessageSubscriberIF)
+
+MessageBrokerIF (Bases: ABC)
+    Interface for message broker objects.
+    add_subscriber(subscription, subscriber)
+        Parameters: subscription (MessageTypes), subscriber (MessageSubscriberIF)
+
+BatchProgressUpdate (Bases: object)
+    Object holding the state of the current batch computation progress.
+    Parameters: global_train_sample_id (int), global_dataset_sample_id (int), experiment_status (ExperimentStatus), dataloader_tag (str)
+
+ExperimentStatus (Bases: Enum)
+    An enumeration.
+
+Message (Bases: Generic[T])
+    An object representing a message.
+    Parameters: message_type (MessageTypes), payload (T), global_rank (int), local_rank (int)
+
+MessageTypes (Bases: Enum)
+    An enumeration.
+
+MessagePublisher (Bases: MessagePublisherIF[T])
+    The MessagePublisher sends messages through a message broker.
+    Parameters: message_broker (MessageBroker), global_rank (int), local_rank (int)
+    publish_message(payload, message_type): Publish a message through the message broker.
+        Parameters: payload (T), message_type (MessageTypes)
EDIT “docs/source/benchmarking.rst” IN ORDER TO MAKE CHANGES HERE
+Training config is defined in yaml formatted files. See data/config_lorem_ipsum.yaml. These configs are very explicit, specifying all training parameters to keep model trainings as transparent and reproducible as possible. Each config setting is reflected in pydantic classes in src/modalities/config/*.py. In the config you need to define which config classes to load in the field type_hint. This specifies the concrete class. A second parameter, config, then takes all the constructor arguments for that config class. This way it is easy to change e.g. DataLoaders while still having input validation in place.
The mechanism introduced to instantiate classes via type_hint in the config.yaml utilizes:
Omegaconf to load the config yaml file
Pydantic for the validation of the config
ClassResolver to instantiate the correct, concrete class of a class hierarchy.
Firstly, Omegaconf loads the config yaml file and resolves internal references such as ${subconfig.attribute}.
Then, Pydantic validates the whole config as-is and checks that each sub-config is a pydantic.BaseModel class.
For configs which allow different concrete classes to be instantiated by ClassResolver, the special member names type_hint and config are introduced.
With this we utilize Pydantic's feature to auto-select a fitting type based on the keys in the config yaml file.
ClassResolver replaces large if-else control structures to infer the correct concrete type, with a type_hint used for correct class selection:
activation_resolver = ClassResolver(
+    [nn.ReLU, nn.Tanh, nn.Hardtanh],
+    base=nn.Module,
+    default=nn.ReLU,
+)
+type_hint = "ReLU"
+activation_kwargs = {...}
+activation_resolver.make(type_hint, activation_kwargs)
+
In our implementation we go a step further, as both
a type_hint in a BaseModel config must be of type modalities.config.lookup_types.LookupEnum, and
config is a union of the allowed concrete configs of base type BaseModel.
config hereby replaces activation_kwargs from the example above with pydantic-validated BaseModel configs.
With this, a mapping between the type hint strings needed for the class-resolver and the concrete classes is introduced, while allowing pydantic to select the correct concrete config:
+from enum import Enum
+
+import torch
+from pydantic import BaseModel, PositiveInt, PositiveFloat, conint, confloat
+
+
+class LookupEnum(Enum):
+    @classmethod
+    def _missing_(cls, value: str) -> type:
+        """constructs Enum by member name, if not constructable by value"""
+        return cls.__dict__[value]
+
+
+class SchedulerTypes(LookupEnum):
+    StepLR = torch.optim.lr_scheduler.StepLR
+    ConstantLR = torch.optim.lr_scheduler.ConstantLR
+
+
+class StepLRConfig(BaseModel):
+    step_size: conint(ge=1)
+    gamma: confloat(ge=0.0)
+
+
+class ConstantLRConfig(BaseModel):
+    factor: PositiveFloat
+    total_iters: PositiveInt
+
+
+class SchedulerConfig(BaseModel):
+    type_hint: SchedulerTypes
+    config: StepLRConfig | ConstantLRConfig
+
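As a brief illustration (not taken from the original docs, and assuming a pydantic version that routes unknown enum values through _missing_, which the LookupEnum above relies on), validating a yaml-style dict against SchedulerConfig resolves both the type hint and the matching config class:

# Dict as it would appear after loading the yaml config.
raw_config = {"type_hint": "StepLR", "config": {"step_size": 10, "gamma": 0.1}}

scheduler_config = SchedulerConfig(**raw_config)
# "StepLR" was looked up by member name via LookupEnum._missing_ ...
assert scheduler_config.type_hint.value is torch.optim.lr_scheduler.StepLR
# ... and the nested dict was validated against the matching member of the config union.
assert isinstance(scheduler_config.config, StepLRConfig)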
To allow a user-friendly instantiation, all class resolvers are defined in the ResolverRegistry, and build_component_by_config is introduced as a convenience function. Dependencies can be passed through with the extra_kwargs argument:
resolvers = ResolverRegister(config=config)
+optimizer = ... # our example dependency
+scheduler = resolvers.build_component_by_config(config=config.scheduler, extra_kwargs=dict(optimizer=optimizer))
+
To add a new resolver, use add_resolver; the added resolver will then be accessible via the register_key given during adding.
For access, use the build_component_by_key_query function of the ResolverRegistry.
We use click as a tool to add new entry points and their CLI arguments. +For this we have a main entry point from which all other entry points are started.
+The main entry point is src/modalities/__main__.py:main()
.
We register other sub-entrypoints by using our main click.group, called main, as follows:
@main.command(name="my_new_entry_point")
+
See the following full example:
+from pathlib import Path
+
+import click
+import click_pathlib
+
+
+@click.group()
+def main() -> None:
+    pass
+
+
+config_option = click.option(
+    "--config_file_path",
+    type=click_pathlib.Path(exists=False),
+    required=True,
+    help="Path to a file with the YAML config file.",
+)
+
+
+@main.command(name="do_stuff")
+@config_option
+@click.option(
+    "--my_cli_argument",
+    type=int,
+    required=True,
+    help="New integer argument",
+)
+def entry_point_do_stuff(config_file_path: Path, my_cli_argument: int):
+    print(f"Do stuff with {config_file_path} and {my_cli_argument}...")
+    ...
+
+
+if __name__ == "__main__":
+    main()
+
With
+[project.scripts]
+modalities = "modalities.__main__:main"
+
in our pyproject.toml, we can start only main with modalities (which does nothing), or a specific sub-entrypoint, e.g. modalities do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537.
Alternatively, directly run python src/modalities/__main__.py do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537.
The team is currently working on our already established LLM code base to bring multi-modality into the mix. This extension will be based on ideas similar to CoCa and/or AudioPaLM, which would enable users to either use different encoders for different modalities in conjunction with a text-based decoder, or use a decoder-only architecture. Future modalities other than text can then be used, namely:
+image
audio
video
We propose a novel training framework for Multimodal Large Language Models (LLMs) that prioritizes code readability and efficiency. +The codebase adheres to the principles of “clean code,” minimizing Lines of Code (LoC) while maintaining extensibility. +A single, comprehensive configuration file enables easy customization of various model and training parameters.
+A key innovation is the adoption of a PyTorch-native training loop integrated with the Fully Sharded Data Parallelism (FSDP) technique. +FSDP optimizes memory usage and training speed, enhancing scalability for large-scale multimodal models. +By leveraging PyTorch’s native capabilities, our framework simplifies the development process and promotes ease of maintenance.
+The framework’s modular design facilitates experimentation with different multimodal architectures and training strategies. +Users can seamlessly integrate diverse datasets and model components, allowing for comprehensive exploration of multimodal learning tasks. +The combination of clean code, minimal configuration, and PyTorch-native training with FSDP contributes to a user-friendly and efficient platform for developing state-of-the-art multimodal language models.
+Note
+This project is under active development.
+hardcoded dataset path /raid/s3/opengptx/mehdi/temp/temp_data/train_text_document.bin
in config/config.yaml
Dependency on weights&biases
The MemMapDataset
requires an index file providing the necessary pointers into the raw data file. The MemMapDataset
can create the index file lazily; however, it is advised to create it beforehand. This can be done by running
modalities create_memmap_index <path/to/jsonl/file>
+
The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via modalities create_memmap_index --help
.
The PackedMemMapDatasetContinuous
and PackedMemMapDatasetMegatron
require a packed data file. To create the data file, you first have to generate a MemMapDataset
index file as described above. Assuming the index and raw data are located in the same directory, you can simply execute the following command:
modalities create_packed_data <path/to/jsonl/file>
+
The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via modalities create_packed_data --help
.
The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts:
+header segment | data segment | index segment
+header segment: This section is an 8-byte integer which encodes the length of the data segment in bytes.
data segment: This section contains a concatenation of all documents in the form of 4-byte tokens. An end-of-sequence token is placed between consecutive documents.
index segment: This section contains a pickled index which locates the documents inside the data segment. The index is basically a list of tuples, where each tuple contains the start position and length in bytes of the corresponding document, e.g., [(start_doc1, len_doc1), (start_doc2, len_doc2), ...].
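To make the layout concrete, here is a hedged reading sketch (not taken from the docs; the little-endian byte order, the offsets being relative to the data segment, and the file name are assumptions, and the actual loader in modalities may differ):

import pickle

with open("train.pbin", "rb") as f:  # hypothetical file name
    # header segment: 8-byte integer holding the length of the data segment in bytes
    data_segment_len = int.from_bytes(f.read(8), byteorder="little")
    # data segment: concatenation of all documents as 4-byte tokens
    data_segment = f.read(data_segment_len)
    # index segment: pickled list of (start, length) tuples, one per document
    index = pickle.loads(f.read())

# Decode the first document back into token ids (4 bytes per token).
start, length = index[0]
first_doc_tokens = [
    int.from_bytes(data_segment[i : i + 4], byteorder="little")
    for i in range(start, start + length, 4)
]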
<TODO>
+modalities
+modalities.checkpointing
+modalities.checkpointing.checkpointing_instruction
+modalities.config
+modalities.config.types
+modalities.dataloader
+modalities.dataloader.open_gptx_dataset
+modalities.exceptions
+modalities.logging_broker
+modalities.logging_broker.message_broker
+modalities.logging_broker.messages
+modalities.logging_broker.publisher
+modalities.logging_broker.subscriber
+modalities.logging_broker.subscriber_impl
+modalities.models
+modalities.models.gpt2
+modalities.running_env
+modalities.running_env.fsdp
Set up a conda environment (conda create -n modalities python=3.10 && conda activate modalities) and install the requirements (pip install -e .).
To start a training you first need to create a memmap dataset out of a jsonl file, then pack it, and then run the training.
+# Create memmap dataset from jsonl file.
+modalities create_memmap_index <path/to/jsonl/file>
+
+# Create packed dataset.
+modalities create_packed_data <path/to/jsonl/file>
+
For example, using the lorem ipsum example:
+# Create memmap dataset from jsonl file.
+modalities create_memmap_index data/lorem_ipsum.jsonl
+
+# Create packed dataset.
+modalities create_packed_data data/lorem_ipsum.jsonl
+
To run a training in a multi-GPU setting, the required environment variables are passed as follows:
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes 1 --nproc_per_node 2 --rdzv-endpoint=0.0.0.0:29502 src/modalities/__main__.py run --config_file_path config_files/config_lorem_ipsum.yaml
+
Evaluation: +WIP add contents
We recommend a Docker environment based on a recent PyTorch image, e.g.:
+FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel
+RUN apt-get update && apt-get install -y wget openssh-client git-core bash-completion
+RUN wget -O /tmp/git-lfs.deb https://packagecloud.io/github/git-lfs/packages/ubuntu/focal/git-lfs_2.13.3_amd64.deb/download.deb && \
+ dpkg -i /tmp/git-lfs.deb && \
+ rm /tmp/git-lfs.deb
+RUN echo 'source /usr/share/bash-completion/completions/git' >> ~/.bashrc
+CMD ["/bin/bash"]
+
This works seamlessly in combination with the VSCode DevContainer extension:
+{
+    "name": "Dev Container",
+    "dockerFile": "Dockerfile",
+    "runArgs": [
+        "--network",
+        "host",
+        "--gpus",
+        "all"
+    ],
+    "customizations": {
+        "vscode": {
+            "settings": {
+                "terminal.integrated.shell.linux": "/bin/bash"
+            },
+            "extensions": [
+                "ms-python.python"
+            ]
+        }
+    }
+}
+
In VSCode, add this to your launch.json:
{
+    "name": "Torchrun Train and Eval",
+    "type": "python",
+    "request": "launch",
+    "module": "torch.distributed.run",
+    "env": {
+        "CUDA_VISIBLE_DEVICES": "4,5"
+    },
+    "args": [
+        "--nnodes",
+        "1",
+        "--nproc_per_node",
+        "2",
+        "--rdzv-endpoint=0.0.0.0:29503",
+        "src/modalities/__main__.py",
+        "run",
+        "--config_file_path",
+        "config_files/config_lorem_ipsum.yaml"
+    ],
+    "console": "integratedTerminal",
+    "justMyCode": true,
+    "envFile": "${workspaceFolder}/.env",
+    "cwd": "${workspaceFolder}/modalities"
+}
+