Merge pull request #190 from Modalities/fix_log_only_with_global_rank_0
Fix log only with global rank 0
fromm-m authored Jul 17, 2024
2 parents e42ecc9 + aa73dce commit 15ed069
Showing 11 changed files with 24 additions and 29 deletions.
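For context: in a distributed run launched with torchrun, LOCAL_RANK is the process index within a single node, while RANK (the global rank) is unique across all nodes. Guarding logging and W&B setup on local_rank == 0 therefore produces one logger per node; guarding on global rank 0 produces exactly one for the whole job, which is what this PR switches to. A minimal sketch of the distinction (illustrative only, not code from this diff):

import os
import torch.distributed as dist

# torchrun exports both variables for every worker process.
local_rank = int(os.environ["LOCAL_RANK"])   # 0 .. gpus_per_node - 1, repeats on every node
global_rank = int(os.environ["RANK"])        # 0 .. world_size - 1, unique across all nodes

if dist.is_available() and dist.is_initialized():
    global_rank = dist.get_rank()            # same global rank, taken from the process group

if global_rank == 0:
    print("exactly one process in the whole job logs")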
4 changes: 2 additions & 2 deletions config_files/training/config_example_coca.yaml
@@ -272,7 +272,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
@@ -293,7 +293,7 @@ evaluation_subscriber:
component_key: results_subscriber
variant_key: wandb
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
project: modalities
mode: OFFLINE
experiment_id: ${settings.experiment_id}
4 changes: 2 additions & 2 deletions config_files/training/config_lorem_ipsum.yaml
@@ -283,7 +283,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
@@ -304,7 +304,7 @@ evaluation_subscriber:
component_key: results_subscriber
variant_key: wandb
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
project: modalities_lorem_ipsum
mode: ONLINE
experiment_id: ${settings.experiment_id}
4 changes: 2 additions & 2 deletions examples/library_usage/config_lorem_ipsum.yaml
@@ -287,7 +287,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
@@ -308,7 +308,7 @@ evaluation_subscriber:
component_key: results_subscriber
variant_key: wandb
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
project: modalities
mode: OFFLINE
experiment_id: ${settings.experiment_id}
3 changes: 1 addition & 2 deletions src/modalities/__main__.py
@@ -232,7 +232,7 @@ def run(self, components: TrainingComponentsInstantiationModel):
* components.settings.cuda_env.world_size
)
trainer = Trainer(
- local_rank=components.settings.cuda_env.local_rank,
+ global_rank=components.settings.cuda_env.global_rank,
batch_progress_publisher=batch_processed_publisher,
evaluation_result_publisher=evaluation_result_publisher,
gradient_acc_steps=components.settings.training.gradient_acc_steps,
@@ -242,7 +242,6 @@ def run(self, components: TrainingComponentsInstantiationModel):

# Evaluator
evaluator = Evaluator(
- local_rank=components.settings.cuda_env.local_rank,
batch_progress_publisher=batch_processed_publisher,
evaluation_result_publisher=evaluation_result_publisher,
)
6 changes: 3 additions & 3 deletions src/modalities/config/config.py
@@ -330,7 +330,7 @@ class RichProgressSubscriberConfig(BaseModel):
train_dataloader: PydanticLLMDataLoaderIFType
eval_dataloaders: Optional[List[PydanticLLMDataLoaderIFType]] = Field(default_factory=list)
global_num_seen_steps: int
- local_rank: int
+ global_rank: int
gradient_acc_steps: Annotated[int, Field(strict=True, gt=0)]


@@ -339,7 +339,7 @@ class DummyResultSubscriberConfig(BaseModel):


class WandBEvaluationResultSubscriberConfig(BaseModel):
- local_rank: int
+ global_rank: int
project: str
experiment_id: str
mode: WandbMode
@@ -349,7 +349,7 @@ class WandBEvaluationResultSubscriberConfig(BaseModel):

class RichResultSubscriberConfig(BaseModel):
num_ranks: int
- local_rank: int
+ global_rank: int


def load_app_config_dict(config_file_path: Path) -> Dict:
4 changes: 1 addition & 3 deletions src/modalities/evaluator.py
@@ -17,11 +17,9 @@
class Evaluator:
def __init__(
self,
- local_rank: int,
batch_progress_publisher: MessagePublisher[BatchProgressUpdate],
evaluation_result_publisher: MessagePublisher[EvaluationResultBatch],
) -> None:
- self.local_rank = local_rank
self.batch_progress_publisher = batch_progress_publisher
self.evaluation_result_publisher = evaluation_result_publisher

@@ -46,7 +44,7 @@ def evaluate(
result_dict: Dict[str, EvaluationResultBatch] = {}
model.eval()

- device = torch.device(self.local_rank if torch.cuda.is_available() else "cpu")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for data_loader in data_loaders:
cumulated_loss = torch.zeros(3).to(device)
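Side note on the device line above: a global rank cannot be used as a CUDA device index, since on a multi-node job it can exceed the number of GPUs per node, so the evaluator (and the trainer below) now ask for the generic "cuda" device instead. That resolves to whichever GPU is current for the process, assuming startup code pins each process to its local GPU; a sketch of that assumption (not code from this diff):

import os
import torch

# Assumption: done once per process at startup (e.g. by the CUDA environment setup),
# so that the bare "cuda" device below refers to this process's own GPU.
if torch.cuda.is_available():
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cumulated_loss = torch.zeros(3).to(device)  # lands on the pinned GPU, or CPU as a fallback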
@@ -20,10 +20,10 @@ def get_rich_progress_subscriber(
train_dataloader: LLMDataLoader,
eval_dataloaders: List[LLMDataLoader],
global_num_seen_steps: int,
- local_rank: int,
+ global_rank: int,
gradient_acc_steps: int,
) -> RichProgressSubscriber:
- if local_rank == 0:
+ if global_rank == 0:
train_split_num_steps = {
# first element describes the total number of steps
# and the second element describes the number of steps already completed
@@ -47,8 +47,8 @@ def get_dummy_progress_subscriber() -> DummyProgressSubscriber:

class ResultsSubscriberFactory:
@staticmethod
- def get_rich_result_subscriber(num_ranks: int, local_rank: int) -> RichResultSubscriber:
- if local_rank == 0:
+ def get_rich_result_subscriber(num_ranks: int, global_rank: int) -> RichResultSubscriber:
+ if global_rank == 0:
return RichResultSubscriber(num_ranks)
else:
return ResultsSubscriberFactory.get_dummy_result_subscriber()
@@ -59,14 +59,14 @@ def get_dummy_result_subscriber() -> DummyResultSubscriber:

@staticmethod
def get_wandb_result_subscriber(
- local_rank: int,
+ global_rank: int,
project: str,
experiment_id: str,
mode: WandbMode,
config_file_path: Path,
directory: Path = None,
) -> WandBEvaluationResultSubscriber:
- if local_rank == 0 and (mode != WandbMode.DISABLED):
+ if global_rank == 0 and (mode != WandbMode.DISABLED):
result_subscriber = WandBEvaluationResultSubscriber(
project, experiment_id, mode, directory, config_file_path
)
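The factories above keep rank handling out of the call sites: only global rank 0 gets a real progress/results subscriber, every other rank gets a dummy that accepts messages and does nothing (a null-object pattern). A generic sketch of that shape with hypothetical names, not the project's actual interfaces:

class NoOpSubscriber:
    def consume_message(self, message) -> None:
        pass  # same interface as the real subscriber, but silent

def build_subscriber(global_rank: int, real_subscriber_factory):
    # Callers never check the rank themselves; non-zero ranks just publish into a sink.
    if global_rank == 0:
        return real_subscriber_factory()
    return NoOpSubscriber()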
10 changes: 5 additions & 5 deletions src/modalities/trainer.py
@@ -27,14 +27,14 @@ class ThroughputAggregationKeys(Enum):
class Trainer:
def __init__(
self,
- local_rank: int,
+ global_rank: int,
batch_progress_publisher: MessagePublisher[BatchProgressUpdate],
evaluation_result_publisher: MessagePublisher[EvaluationResultBatch],
gradient_acc_steps: int,
global_num_tokens_per_train_step: int,
gradient_clipper: GradientClipperIF,
) -> None:
- self.local_rank = local_rank
+ self.global_rank = global_rank
self.batch_progress_publisher = batch_progress_publisher
self.evaluation_result_publisher = evaluation_result_publisher
self.gradient_acc_steps = gradient_acc_steps
@@ -89,7 +89,7 @@ def train(

thoughput_aggregator = Aggregator[ThroughputAggregationKeys]()

- device = torch.device(self.local_rank if torch.cuda.is_available() else "cpu")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# batch loop
batch: DatasetBatch
@@ -194,7 +194,7 @@ def train(
dataloader_tag=train_loader.dataloader_tag,
num_train_steps_done=num_train_steps_done,
)
- if self.local_rank == 0:
+ if self.global_rank == 0:
print(training_metrics)
self._publish_evaluation_result(
evaluation_result_publisher=self.evaluation_result_publisher,
@@ -215,7 +215,7 @@ def _reset_tracked_losses(self):
# summed local losses, loss of last local batch, number of local batches (i.e., number of steps)
cumulated_loss_and_gradient_norm = torch.zeros(3)
if torch.cuda.is_available():
- cumulated_loss_and_gradient_norm = cumulated_loss_and_gradient_norm.to(torch.device(self.local_rank))
+ cumulated_loss_and_gradient_norm = cumulated_loss_and_gradient_norm.to(torch.device("cuda"))
else:
cumulated_loss_and_gradient_norm = cumulated_loss_and_gradient_norm.to("cpu")
return cumulated_loss_and_gradient_norm
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -181,7 +181,7 @@ def progress_publisher_mock():
@pytest.fixture(scope="function")
def trainer(progress_publisher_mock, gradient_clipper_mock):
return Trainer(
- local_rank=int(os.getenv("LOCAL_RANK")),
+ global_rank=int(os.getenv("RANK")),
batch_progress_publisher=progress_publisher_mock,
evaluation_result_publisher=progress_publisher_mock,
gradient_acc_steps=1,
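Because the fixture now reads RANK (the variable torchrun exports for the global rank) rather than LOCAL_RANK, running these tests outside a torchrun launcher requires that variable to be present. One way to supply it, sketched with pytest's monkeypatch (illustrative, not part of this diff):

import pytest

@pytest.fixture(autouse=True)
def fake_single_process_env(monkeypatch):
    # Pretend to be a one-process job so fixtures that read RANK keep working in plain pytest.
    monkeypatch.setenv("RANK", "0")
    monkeypatch.setenv("WORLD_SIZE", "1")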
2 changes: 0 additions & 2 deletions tests/test_evaluator.py
@@ -1,4 +1,3 @@
- import os
from unittest.mock import call

import torch
@@ -26,7 +25,6 @@ def test_evaluate_cpu(
llm_data_loader_mock.batch_size = batch_size

evaluator = Evaluator(
- local_rank=int(os.getenv("LOCAL_RANK")),
batch_progress_publisher=progress_publisher_mock,
evaluation_result_publisher=progress_publisher_mock,
)
2 changes: 1 addition & 1 deletion tests/test_yaml_configs/config_lorem_ipsum.yaml
@@ -282,7 +282,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
- local_rank: ${settings.cuda_env.local_rank}
+ global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens

