processors support aistudio #330

Open · wants to merge 2 commits into develop
Changes from 1 commit
7 changes: 6 additions & 1 deletion paddlemix/models/blip2/base_model.py
@@ -118,6 +118,7 @@ def init_tokenizer(cls, tokenizer_name="bert-base-uncased"):
     @classmethod
     def refine_state_dict(self, model, state_dict):
         from paddlemix.models.blip2.eva_vit import interpolate_pos_embed
+
         interpolate_pos_embed(model, state_dict)
 
     def get_expected_keys(self, model_state_dict, name=None):
@@ -203,6 +204,7 @@ def from_pretrained(
         subfolder = kwargs.pop("subfolder", "")
         variant = kwargs.pop("variant", None)
         use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+        from_aistudio = kwargs.get("from_aistudio", False)
 
         low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
         convert_from_torch = kwargs.pop("convert_from_torch", None)
@@ -269,6 +271,7 @@ def from_pretrained(
                 cache_dir=cache_dir,
                 subfolder=subfolder,
                 from_hf_hub=from_hf_hub,
+                from_aistudio=from_aistudio,
                 config=config,
                 convert_from_torch=convert_from_torch,
                 use_safetensors=use_safetensors,
@@ -322,7 +325,9 @@ def from_pretrained(
         init_args = config["init_args"] or ()
         with ContextManagers(init_contexts):
             model = cls(config, *init_args, **model_kwargs)
-        cls.refine_state_dict(model, state_dict)
+        if state_dict is not None:
+            cls.refine_state_dict(model, state_dict)
 
         if use_keep_in_fp32_modules:
             # low_cpu_mem_usage = True
             keep_in_fp32_modules = model._keep_in_fp32_modules
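Taken together, these changes thread a `from_aistudio` flag through BLIP-2's `from_pretrained` and guard `refine_state_dict` against a missing state dict. A minimal sketch of the intended call; the model class and repo id here are illustrative assumptions, only the `from_aistudio` kwarg itself comes from this diff:

```python
from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration

# Hypothetical AI Studio repo id; the flag routes weight resolution to
# AI Studio instead of the HF Hub or the community mirrors.
model = Blip2ForConditionalGeneration.from_pretrained(
    "user/blip2-example",
    from_aistudio=True,
)
```

Note that the flag is read with `kwargs.get` rather than `kwargs.pop`, so it remains in `kwargs` for downstream calls.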
10 changes: 5 additions & 5 deletions paddlemix/models/blip2/utils.py
@@ -100,16 +100,16 @@ def load_real_time_tokens():
     return tokens
 
 
-def create_tokenizer(text_model_name_or_path):
+def create_tokenizer(text_model_name_or_path, **kwags):
     if "opt" in text_model_name_or_path:
-        tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
+        tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
     elif "t5" in text_model_name_or_path:
-        tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
+        tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
     elif "llama" in text_model_name_or_path:
-        tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path)
+        tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path, **kwags)
         tokenizer_class.pad_token = tokenizer_class.eos_token
     elif "bloom" in text_model_name_or_path:
-        tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path)
+        tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path, **kwags)
         tokenizer_class.pad_token = tokenizer_class.eos_token
     else:
         raise NotImplementedError
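Since the tokenizer factory now forwards `**kwags` to each underlying `from_pretrained` call, hub-selection flags can ride along. A sketch under the same assumptions as above (repo id illustrative):

```python
from paddlemix.models.blip2.utils import create_tokenizer

# "opt" in the name selects the AutoTokenizer branch; from_aistudio is
# forwarded through **kwags to AutoTokenizer.from_pretrained.
tokenizer = create_tokenizer("facebook/opt-2.7b", from_aistudio=True)
```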
14 changes: 14 additions & 0 deletions paddlemix/processors/image_processing_utils.py
@@ -40,6 +40,14 @@
 )
 from paddlemix.utils.log import logger
 
+try:
+    from paddlenlp.transformers.aistudio_utils import aistudio_download
+except:
+    logger.warning("aistudio_download not import, if you want to use , require paddlenlp develop")
+    aistudio_download = None
+    pass
+
+
 IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json"
 TEXT_PROCESSOR_NAME = "text_processor_config.json"
 
@@ -272,6 +280,7 @@ def get_image_processor_dict(
         """
         cache_dir = kwargs.pop("cache_dir", None)
         from_hf_hub = kwargs.pop("from_hf_hub", False)
+        from_aistudio = kwargs.get("from_aistudio", False)
         subfolder = kwargs.pop("subfolder", None)
         cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
 
@@ -292,6 +301,11 @@ def get_image_processor_dict(
                 library_name="PaddleNLP",
                 library_version=__version__,
             )
+        elif from_aistudio and aistudio_download is not None:
+            image_processor_file = IMAGE_PROCESSOR_NAME
+            resolved_image_processor_file = aistudio_download(
+                repo_id=pretrained_model_name_or_path, filename=image_processor_file
+            )
         else:
             # Assuming from community-contributed pretrained models
             image_processor_file = "/".join(
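With the guarded import and the new `elif` branch, `get_image_processor_dict` fetches `image_preprocessor_config.json` from AI Studio when the flag is set and `aistudio_download` imported successfully; otherwise it falls through to the community path. A usage sketch, with the processor class and repo id as illustrative assumptions:

```python
from paddlemix.processors.blip_processing import BlipImageProcessor

# Hypothetical repo id; image_preprocessor_config.json is fetched via
# aistudio_download when available, else the community fallback is used.
image_processor = BlipImageProcessor.from_pretrained(
    "user/blip2-example",
    from_aistudio=True,
)
```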
73 changes: 73 additions & 0 deletions paddlemix/processors/processing_utils.py
@@ -44,6 +44,7 @@
     logger.warning("aistudio_download not import, if you want to use , require paddlenlp develop")
     aistudio_download = None
     pass
+import aistudio_sdk
 
 PROCESSOR_CONFIG_MAPPING = {
     "image": "image_preprocessor_config.json",
@@ -228,6 +229,69 @@ def save_to_hf_hub(
             create_pr=create_pr,
         )
 
+    def save_to_aistudio(
+        self,
+        repo_id,
+        private=True,
+        license="Apache License 2.0",
+        exist_ok=True,
+        safe_serialization=True,
+        subfolder=None,
+        merge_tensor_parallel=False,
+        **kwargs
+    ):
+        """
+        Uploads all elements of this model to a new AiStudio Hub repository.
+        Args:
+            repo_id (str): Repository name for your model/tokenizer in the Hub.
+            token (str): Your token for the Hub.
+            private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True.
+            license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0".
+            exist_ok (bool, optional): Whether to override existing repository. Defaults to: True.
+            safe_serialization (bool, optional): Whether to save the model in safe serialization way. Defaults to: True.
+            subfolder (str, optional): Push to a subfolder of the repo instead of the root
+            merge_tensor_parallel (bool): Whether to merge the tensor parallel weights. Defaults to False.
+        """
+
+        res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs)
+        if "error_code" in res:
+            if res["error_code"] == 10003 and exist_ok:
+                logger.info(
+                    f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False"
+                )
+            else:
+                logger.error(
+                    f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
+                )
+        else:
+            logger.info(f"Successfully created repo {repo_id}")
+
+        with tempfile.TemporaryDirectory() as root_dir:
+            if subfolder is not None:
+                save_dir = os.path.join(root_dir, subfolder)
+            else:
+                save_dir = root_dir
+
+            # save model
+            self.save_pretrained(save_dir)
+
+            # Upload model and return
+            logger.info(f"Pushing to the {repo_id}. This might take a while")
+            for filename in os.listdir(save_dir):
+                path_in_repo = os.path.join(subfolder, filename) if subfolder is not None else filename
+                res = aistudio_sdk.hub.upload(
+                    repo_id=repo_id,
+                    path_or_fileobj=os.path.join(save_dir, filename),
+                    path_in_repo=path_in_repo,
+                    **kwargs,
+                )
+                if "error_code" in res:
+                    logger.error(
+                        f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
+                    )
+                else:
+                    logger.info(f"{filename}: {res['message']}")
+
     @classmethod
     def get_processor_dict(
         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs

Review thread on save_to_aistudio:

Collaborator: Please add unit tests for the upload and download capability.

Collaborator (Author): done
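A sketch of pushing a saved processor with the new method. The repo id is hypothetical, and authentication is assumed to be handled by `aistudio_sdk`'s own token configuration, which this diff does not set up:

```python
# `processor` is any instance of a class exposing this mixin method.
# create_repo treats error_code 10003 as "repo already exists" and, with
# exist_ok=True, proceeds to overwrite same-named files; everything written
# by save_pretrained into a temp dir is then uploaded file by file.
processor.save_to_aistudio(
    repo_id="user/blip2-example",
    private=True,
    exist_ok=True,
    subfolder="processor",  # files land under processor/ in the repo
)
```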
@@ -273,6 +337,15 @@ def get_processor_dict(
             )
         elif from_aistudio and aistudio_download is not None:
             processor_file = PROCESSOR_CONFIG_MAPPING[cls.input_type]
+            if subfolder is not None:
+                processor_file = os.path.join(subfolder, processor_file)
+
+            pretrained_model_name_or_path_list = pretrained_model_name_or_path.split("/")
+            if len(pretrained_model_name_or_path_list) > 2:
+                pretrained_model_name_or_path = os.path.join(
+                    pretrained_model_name_or_path_list[0], pretrained_model_name_or_path_list[1]
+                )
+
             resolved_processor_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=processor_file)
         else:
             # Assuming from community-contributed pretrained models
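The id-splitting above trims anything past `org/repo` before calling `aistudio_download`, while `subfolder` is folded into the requested filename instead. A worked example of just that normalization, with illustrative values:

```python
import os

# Illustrative inputs: a three-segment id plus a subfolder.
pretrained_model_name_or_path = "org/repo/extra-segment"
processor_file = "image_preprocessor_config.json"
subfolder = "processor"

processor_file = os.path.join(subfolder, processor_file)
parts = pretrained_model_name_or_path.split("/")
if len(parts) > 2:
    pretrained_model_name_or_path = os.path.join(parts[0], parts[1])

print(pretrained_model_name_or_path)  # org/repo
print(processor_file)                 # processor/image_preprocessor_config.json
```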
2 changes: 1 addition & 1 deletion requirements.txt
@@ -7,4 +7,4 @@ pycocoevalcap
 ftfy
 regex
 einops>=0.6.1
-
+aistudio-sdk>=0.1.3