From 274ba66e9a99d03f8edc886b32e986000548d16f Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk
Date: Mon, 22 Apr 2024 22:18:39 +0200
Subject: [PATCH 1/2] fix: avoid pydantic warnings in CLI help message

---
 src/modalities/models/huggingface/huggingface_models.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/modalities/models/huggingface/huggingface_models.py b/src/modalities/models/huggingface/huggingface_models.py
index 4c66d46f..b980e671 100644
--- a/src/modalities/models/huggingface/huggingface_models.py
+++ b/src/modalities/models/huggingface/huggingface_models.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, List, Optional
 
 import torch
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
 from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoTokenizer
 
 from modalities.config.lookup_enum import LookupEnum
@@ -35,6 +35,10 @@ class HuggingFacePretrainedModelConfig(BaseModel):
     model_args: Optional[Any] = None
     kwargs: Optional[Any] = None
 
+    # avoid warning about protected namespace 'model_', see
+    # https://docs.pydantic.dev/2.7/api/config/#pydantic.config.ConfigDict.protected_namespaces
+    model_config = ConfigDict(protected_namespaces=())
+
 
 class HuggingFacePretrainedModel(NNModel):
     def __init__(

From c7d828cb07d0e580da70bb5beea8bfcb51ed38f4 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk
Date: Mon, 22 Apr 2024 22:20:43 +0200
Subject: [PATCH 2/2] fix: linting

---
 src/modalities/tokenization/tokenizer_wrapper.py | 10 ++++------
 tests/dataloader/test_packed_dataset.py          |  9 ++++++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/modalities/tokenization/tokenizer_wrapper.py b/src/modalities/tokenization/tokenizer_wrapper.py
index bb1edd42..1beeb9a7 100644
--- a/src/modalities/tokenization/tokenizer_wrapper.py
+++ b/src/modalities/tokenization/tokenizer_wrapper.py
@@ -11,12 +11,11 @@ def tokenize(self, text: str) -> List[int]:
     @property
     def vocab_size(self) -> int:
         raise NotImplementedError("Tokenizer must be implemented by a subclass.")
-    
+
     def get_token_id(self, token: str) -> int:
         raise NotImplementedError
 
 
-
 class PreTrainedHFTokenizer(TokenizerWrapper):
     def __init__(
         self, pretrained_model_name_or_path: str, max_length: int, truncation: bool = True, padding: str = "max_length"
@@ -38,7 +37,7 @@ def tokenize(self, text: str) -> List[int]:
             truncation=self.truncation,
         )["input_ids"]
         return tokens
-    
+
     def get_token_id(self, token: str) -> int:
         token_id = self.tokenizer.convert_tokens_to_ids(token)
         if isinstance(token_id, list):
@@ -53,16 +52,15 @@ def __init__(self, tokenizer_model_file: str):
         pass
 
     def tokenize(self, text: str) -> List[int]:
-        tokens = self.tokenizer.encode(text) 
+        tokens = self.tokenizer.encode(text)
         return tokens
 
     @property
     def vocab_size(self) -> int:
         return self.tokenizer.vocab_size()
-    
+
     def get_token_id(self, token: str) -> int:
         piece_id = self.tokenizer.PieceToId(token)
         if piece_id == self.tokenizer.unk_id():
             raise ValueError("Token is not represented by a single token id!")
         return piece_id
-
diff --git a/tests/dataloader/test_packed_dataset.py b/tests/dataloader/test_packed_dataset.py
index 5c3ba94f..090e844b 100644
--- a/tests/dataloader/test_packed_dataset.py
+++ b/tests/dataloader/test_packed_dataset.py
@@ -6,7 +6,6 @@
 from modalities.dataloader.create_packed_data import EmbeddedStreamData, PackedDataGenerator, join_embedded_stream_data
 from modalities.dataloader.dataset import PackedMemMapDatasetContinuous, PackedMemMapDatasetMegatron
 from modalities.models.gpt2.collator import GPT2LLMCollateFn
-from modalities.tokenization.tokenizer_wrapper import TokenizerWrapper
 
 
 @pytest.mark.parametrize("block_size, expected_length", [(1, 4), (2, 3), (3, 3), (10, 2), (6, 2), (20, 1), (25, 0)])
@@ -43,8 +42,12 @@ def test_packed_continuous_dataset_missing_file(dummy_packed_data_path):
 def test_create_packed_dataset(indexed_dummy_data_path, wrapped_gpt2_tokenizer):
     block_size = 5
     packed_generator = PackedDataGenerator(
-        src_path=indexed_dummy_data_path.raw_data_path, tokenizer=wrapped_gpt2_tokenizer, number_of_processes=2, eod_token="<|endoftext|>",
-        index_path=indexed_dummy_data_path.index_path, jq_pattern=".text"
+        src_path=indexed_dummy_data_path.raw_data_path,
+        tokenizer=wrapped_gpt2_tokenizer,
+        number_of_processes=2,
+        eod_token="<|endoftext|>",
+        index_path=indexed_dummy_data_path.index_path,
+        jq_pattern=".text",
     )
     default_packed_dataset_path = packed_generator._default_destination_path()
     assert not default_packed_dataset_path.is_file()
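
Note on PATCH 1/2: pydantic v2 treats the "model_" prefix as a protected
namespace, so declaring a field such as model_args on a BaseModel emits a
UserWarning at class-definition time; since the config classes are imported
when the CLI starts up, that is presumably how the warnings ended up in the
help message. Below is a minimal standalone sketch of the warning and the
fix, with illustrative class names that are not from the repository:

    from pydantic import BaseModel, ConfigDict

    class Warns(BaseModel):
        # pydantic v2 warns here: Field "model_args" has conflict with
        # protected namespace "model_"
        model_args: str = ""

    class Silent(BaseModel):
        # clearing the protected namespaces suppresses the warning,
        # as done for HuggingFacePretrainedModelConfig in the patch
        model_config = ConfigDict(protected_namespaces=())
        model_args: str = ""

Emptying the namespace tuple is safe here as long as no field name collides
with an actual pydantic method such as model_dump or model_validate; pydantic
still raises an error for those regardless of the protected_namespaces setting.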