Merge pull request #152 from Modalities/tokenizer_remove_max_length_flag
Improved the Hugging Face tokenizer integration:
* `max_length`, `truncation`, `padding`, and `special_tokens` are now fully configurable
* added extensive tests covering all four parameters (a hedged config sketch follows the file summary below)
le1nux committed Jun 14, 2024
2 parents 31ff5c6 + c4f65f6 commit ed3fb62
Showing 14 changed files with 366 additions and 92 deletions.
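For quick orientation, the sketch below shows how the now-configurable fields might be passed to the Pydantic config model introduced in this PR. The class name and module path follow the `src/modalities/config/config.py` diff further down; the concrete values and the pad-token mapping are illustrative assumptions, not part of the commit.

```python
# Minimal sketch, assuming the PreTrainedHFTokenizerConfig model from
# src/modalities/config/config.py (see the diff below). Field names match
# this PR; the concrete values and the pad token are illustrative only.
from modalities.config.config import PreTrainedHFTokenizerConfig

cfg = PreTrainedHFTokenizerConfig(
    pretrained_model_name_or_path="data/tokenizer/hf_gpt2",
    truncation=False,                       # now defaults to False
    padding=False,                          # bool or a strategy string such as "max_length"
    max_length=None,                        # optional since this PR
    special_tokens={"pad_token": "<pad>"},  # assumed mapping, only needed when padding
)
print(cfg.model_dump())
```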
2 changes: 1 addition & 1 deletion config_files/data_preparation/packed_cc_en_2048.yaml
@@ -15,4 +15,4 @@ tokenizer:
   config:
     tokenizer_model_file: /workspaces/modalities/data/tokenizer/sp_bpe_en/bpe_tokenizer.model
     padding: false
-    max_length: 2048
+    truncation: false
2 changes: 1 addition & 1 deletion config_files/data_preparation/packed_dataset_config.yaml
@@ -12,4 +12,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: data/tokenizer/hf_gpt2
     padding: false
-    max_length: 512
+    truncation: false
(another changed config file; path not shown in this capture)
@@ -90,4 +90,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
     padding: false
-    max_length: ${settings.context_length}
+    truncation: false
(another changed config file; path not shown in this capture)
@@ -93,4 +93,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
     padding: false
-    max_length: ${settings.context_length}
+    truncation: false
2 changes: 1 addition & 1 deletion examples/getting_started/README.md
@@ -58,7 +58,7 @@ tokenizer:
   config:
     pretrained_model_name_or_path: tokenizer
     padding: false
-    max_length: 512
+    truncation: false
 ```
 
 ### Step 1: Create Index
2 changes: 1 addition & 1 deletion examples/getting_started/example_dataset_config_test.yaml
@@ -15,4 +15,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: tokenizer
     padding: false
-    max_length: 512
+    truncation: false
2 changes: 1 addition & 1 deletion examples/getting_started/example_dataset_config_train.yaml
@@ -15,4 +15,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: tokenizer
     padding: false
-    max_length: 512
+    truncation: false
(another changed config file; path not shown in this capture)
@@ -93,4 +93,4 @@ tokenizer:
   config:
     pretrained_model_name_or_path: tokenizer
     padding: false
-    max_length: ${settings.context_length}
+    truncation: false
2 changes: 1 addition & 1 deletion notebooks/components.yaml
@@ -11,7 +11,7 @@ tokenizer:
   config:
     tokenizer_model_file: /workspaces/modalities/notebooks/tokenizer/unigram_tokenizer.model
     padding: false
-    max_length: 2048
+    truncation: false
 
 train_dataset:
   component_key: dataset
38 changes: 30 additions & 8 deletions src/modalities/config/config.py
@@ -4,7 +4,14 @@
 
 import torch
 from omegaconf import OmegaConf
-from pydantic import BaseModel, Field, FilePath, PositiveInt, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    Field,
+    FilePath,
+    PositiveInt,
+    field_validator,
+    model_validator,
+)
 from torch.distributed.fsdp import ShardingStrategy
 from transformers import GPT2TokenizerFast
 from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast
@@ -146,7 +153,9 @@ class StepLRSchedulerConfig(BaseModel):
 
 class OneCycleLRSchedulerConfig(BaseModel):
     optimizer: PydanticOptimizerIFType
-    max_lr: Annotated[float, Field(strict=True, gt=0.0)] | List[Annotated[float, Field(strict=True, gt=0.0)]]
+    max_lr: Annotated[float, Field(strict=True, gt=0.0)] | List[
+        Annotated[float, Field(strict=True, gt=0.0)]
+    ]
     total_steps: Optional[Annotated[int, Field(strict=True, gt=0)]] = None
     epochs: Optional[Annotated[int, Field(strict=True, gt=0)]] = None
     steps_per_epoch: Optional[Annotated[int, Field(strict=True, gt=0)]] = None
@@ -167,8 +176,12 @@ class OneCycleLRSchedulerConfig(BaseModel):
 
     @model_validator(mode="after")
     def check_totals_steps_and_epchs(self) -> "OneCycleLRSchedulerConfig":
-        if self.total_steps is None and (self.epochs is None or self.steps_per_epoch is None):
-            raise ValueError("Please define total_steps or (epochs and steps_per_epoch).")
+        if self.total_steps is None and (
+            self.epochs is None or self.steps_per_epoch is None
+        ):
+            raise ValueError(
+                "Please define total_steps or (epochs and steps_per_epoch)."
+            )
         return self
 
@@ -227,9 +240,10 @@ def parse_sharding_strategy_by_name(cls, name):
 
 class PreTrainedHFTokenizerConfig(BaseModel):
     pretrained_model_name_or_path: str
-    max_length: Annotated[int, Field(strict=True, ge=0)]
+    max_length: Optional[Annotated[int, Field(strict=True, ge=0)]] = None
     truncation: bool = False
     padding: bool | str = False
+    special_tokens: Optional[Dict[str, str]] = None
 
 
 class PreTrainedSPTokenizerConfig(BaseModel):
@@ -316,7 +330,9 @@ class DummyProgressSubscriberConfig(BaseModel):
 
 class RichProgressSubscriberConfig(BaseModel):
     train_dataloader: PydanticLLMDataLoaderIFType
-    eval_dataloaders: Optional[List[PydanticLLMDataLoaderIFType]] = Field(default_factory=list)
+    eval_dataloaders: Optional[List[PydanticLLMDataLoaderIFType]] = Field(
+        default_factory=list
+    )
     global_num_seen_steps: int
     local_rank: int
 
@@ -342,7 +358,11 @@ class RichResultSubscriberConfig(BaseModel):
 def load_app_config_dict(config_file_path: Path) -> Dict:
     def cuda_env_resolver_fun(var_name: str) -> int:
         int_env_variable_names = ["LOCAL_RANK", "WORLD_SIZE", "RANK"]
-        return int(os.getenv(var_name)) if var_name in int_env_variable_names else os.getenv(var_name)
+        return (
+            int(os.getenv(var_name))
+            if var_name in int_env_variable_names
+            else os.getenv(var_name)
+        )
 
     def modalities_env_resolver_fun(var_name: str) -> int:
         if var_name == "experiment_id":
@@ -355,7 +375,9 @@ def node_env_resolver_fun(var_name: str) -> int:
         return os.cpu_count()
 
     OmegaConf.register_new_resolver("cuda_env", cuda_env_resolver_fun, replace=True)
-    OmegaConf.register_new_resolver("modalities_env", modalities_env_resolver_fun, replace=True)
+    OmegaConf.register_new_resolver(
+        "modalities_env", modalities_env_resolver_fun, replace=True
+    )
     OmegaConf.register_new_resolver("node_env", node_env_resolver_fun, replace=True)
 
     cfg = OmegaConf.load(config_file_path)
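As an aside, the resolver registrations shown above are what enable `${cuda_env:...}`, `${modalities_env:...}`, and `${node_env:...}` interpolations in the YAML configs. A self-contained sketch of the same pattern, with a made-up one-line config, might look like this:

```python
# Standalone sketch of the resolver pattern used in load_app_config_dict.
# The resolver name and function mirror the diff above; the one-line YAML
# config and the environment value are made up for illustration.
import os

from omegaconf import OmegaConf


def cuda_env_resolver_fun(var_name: str) -> int:
    int_env_variable_names = ["LOCAL_RANK", "WORLD_SIZE", "RANK"]
    return int(os.getenv(var_name)) if var_name in int_env_variable_names else os.getenv(var_name)


OmegaConf.register_new_resolver("cuda_env", cuda_env_resolver_fun, replace=True)

os.environ.setdefault("LOCAL_RANK", "0")
cfg = OmegaConf.create("local_rank: ${cuda_env:LOCAL_RANK}")
print(cfg.local_rank)  # resolved lazily from the environment -> 0
```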
59 changes: 0 additions & 59 deletions src/modalities/models/gpt2/preprocess_dataset.py

This file was deleted.

27 changes: 24 additions & 3 deletions src/modalities/tokenization/tokenizer_wrapper.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import List
+from typing import Dict, List, Optional
 
 import sentencepiece as spm
 from transformers import AutoTokenizer
@@ -25,9 +25,26 @@ def get_token_id(self, token: str) -> int:
 
 class PreTrainedHFTokenizer(TokenizerWrapper):
     def __init__(
-        self, pretrained_model_name_or_path: str, max_length: int, truncation: bool = True, padding: str = "max_length"
+        self,
+        pretrained_model_name_or_path: str,
+        truncation: bool = False,
+        padding: bool | str = False,
+        max_length: Optional[int] = None,
+        special_tokens: Optional[Dict[str, str]] = None,
     ) -> None:
-        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path)
+        # also see here for the truncation and padding options and their effects:
+        # https://huggingface.co/docs/transformers/pad_truncation#padding-and-truncation
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path=pretrained_model_name_or_path
+        )
+        if special_tokens is not None:
+            # TODO check if we always want to set
+            # replace_additional_special_tokens=False
+            self.tokenizer.add_special_tokens(
+                special_tokens_dict=special_tokens,
+                replace_additional_special_tokens=False,
+            )
         self.max_length = max_length
         self.truncation = truncation
         self.padding = padding
@@ -36,6 +53,10 @@ def __init__(
     def vocab_size(self):
         return self.tokenizer.vocab_size
 
+    @property
+    def special_tokens(self) -> Dict[str, str | List[str]]:
+        return self.tokenizer.special_tokens_map
+
     def tokenize(self, text: str) -> List[int]:
         tokens = self.tokenizer.__call__(
             text,
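Finally, a hedged usage sketch for the reworked wrapper. The constructor signature and the `tokenize`/`special_tokens` members come from the diff above; the model name, the sample text, and the assumption that `tokenize` forwards `truncation`, `padding`, and `max_length` to the underlying Hugging Face tokenizer are illustrative.

```python
# Usage sketch for the reworked PreTrainedHFTokenizer. The constructor
# arguments mirror the diff above; "gpt2", the sample text, and the exact
# output length of tokenize() are assumptions for illustration.
from modalities.tokenization.tokenizer_wrapper import PreTrainedHFTokenizer

tokenizer = PreTrainedHFTokenizer(
    pretrained_model_name_or_path="gpt2",
    truncation=True,        # only takes effect together with max_length
    padding="max_length",   # pad every sample up to max_length
    max_length=16,
    special_tokens={"pad_token": "<pad>"},  # GPT-2 has no pad token by default
)

token_ids = tokenizer.tokenize("Truncation and padding are now explicit.")
print(len(token_ids))            # expected: 16 (padded/truncated to max_length)
print(tokenizer.special_tokens)  # includes the added pad token
```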