
Commit

fix moondream2
BBC-Esq authored Jul 27, 2024
1 parent e84bfd9 commit b81f681
Showing 1 changed file with 6 additions and 319 deletions.
325 changes: 6 additions & 319 deletions src/module_process_images.py
@@ -15,7 +15,7 @@
from tqdm import tqdm
from transformers import (
AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
)

from langchain_community.docstore.document import Document
@@ -33,19 +33,8 @@
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger().setLevel(logging.WARNING)

# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
# logging.getLogger("transformers").setLevel(logging.ERROR)
# logging.getLogger("transformers").setLevel(logging.WARNING)
# logging.getLogger("transformers").setLevel(logging.INFO)
# logging.getLogger("transformers").setLevel(logging.DEBUG)

ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']

current_directory = Path(__file__).parent
CACHE_DIR = current_directory / "models" / "vision"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

current_directory = Path(__file__).parent
VISION_DIR = current_directory / "models" / "vision"
VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -76,20 +65,10 @@ def choose_image_loader():

chosen_model = config["vision"]["chosen_model"]

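# dispatch to the loader class that matches the vision model chosen in config.yaml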
if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b', ]:
loader_func = loader_llava(config).process_images
elif chosen_model == 'Cogvlm':
loader_func = loader_cogvlm(config).process_images
elif chosen_model == 'Moondream2':
if chosen_model == 'Moondream2':
loader_func = loader_moondream(config).process_images
elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
loader_func = loader_florence2(config).process_images
elif chosen_model == 'Phi-3-vision-128k-instruct':
loader_func = loader_phi3vision(config).process_images
elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
loader_func = loader_minicpm_llama3v(config).process_images
elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
loader_func = loader_llava_next(config).process_images
else:
my_cprint("No valid image model specified in config.yaml", "red")
return []
@@ -164,125 +143,6 @@ def process_images(self):
def process_single_image(self, raw_image):
raise NotImplementedError("Subclasses must implement process_single_image method")

class loader_cogvlm(BaseLoader):
def initialize_model_and_tokenizer(self):
model_name = 'THUDM/cogvlm-chat-hf'
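# use bfloat16 on GPUs with compute capability >= 8 (Ampere or newer), otherwise fall back to float16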
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)

tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=quantization_config,
torch_dtype=TORCH_TYPE,
low_cpu_mem_usage=True,
trust_remote_code=True
)

my_cprint(f"Cogvlm vision model loaded into memory...", "green")
return model, tokenizer, None

@torch.inference_mode()
def process_single_image(self, raw_image):
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
}

gen_kwargs = {"max_length": 2048, "do_sample": False}
output = self.model.generate(**inputs, **gen_kwargs)
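# keep only the newly generated tokens by slicing off the prompt portion of the sequence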
output = output[:, inputs['input_ids'].shape[1]:]
model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
return model_response

class loader_llava(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']

model_info = VISION_MODELS[chosen_model]
model_id = model_info['repo_id']
precision = model_info['precision']
save_dir = model_info["cache_dir"]
cache_dir = CACHE_DIR / save_dir
cache_dir.mkdir(parents=True, exist_ok=True)

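# quantize the weights to 4-bit to reduce VRAM usage; computations run in float16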
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model = LlavaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=quantization_config,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
cache_dir=cache_dir
)

my_cprint(f"{chosen_model} vision model loaded into memory...", "green")

processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)

return model, None, processor

@torch.inference_mode()
def process_single_image(self, raw_image):
prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
inputs = inputs.to(torch.float32)

output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
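# output[0][2:] drops the first two token ids (special tokens) before decoding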
full_response = self.processor.decode(output[0][2:], skip_special_tokens=True, do_sample=False)
model_response = full_response.split("ASSISTANT: ")[-1]
return model_response


class loader_llava_next(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']

model_info = VISION_MODELS[chosen_model]
model_id = model_info['repo_id']
precision = model_info['precision']
save_dir = model_info["cache_dir"]
cache_dir = CACHE_DIR / save_dir
cache_dir.mkdir(parents=True, exist_ok=True)

quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)

model = LlavaNextForConditionalGeneration.from_pretrained(
model_id,
quantization_config=quantization_config,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
cache_dir=cache_dir
)

my_cprint(f"{chosen_model} vision model loaded into memory...", "green")

processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)

return model, None, processor

@torch.inference_mode()
def process_single_image(self, raw_image):
user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
prompt = f"USER: <image>\n{user_prompt} ASSISTANT:"
inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device)

output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)

response = self.processor.decode(output[0], skip_special_tokens=True) # possibly adjust to "full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)" or similar if the output is inexplicably preceded by special tokens
model_response = response.split("ASSISTANT:")[-1].strip()

return model_response


class loader_moondream(BaseLoader):
def initialize_model_and_tokenizer(self):
@@ -320,14 +180,11 @@ def __init__(self, config):

def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
repo_id = VISION_MODELS[chosen_model]["repo_id"]
save_dir = VISION_MODELS[chosen_model]["cache_dir"]

cache_dir = CACHE_DIR / save_dir
cache_dir.mkdir(parents=True, exist_ok=True)
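# cache downloads directly in the shared VISION_DIR rather than a per-model subdirectory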
model_id = VISION_MODELS[chosen_model]['repo_id']
cache_dir = VISION_DIR

model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)

device_type, precision_type = self.get_device_and_precision()

@@ -371,173 +228,3 @@ def process_single_image(self, raw_image):
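# post_process_generation turns the raw generated text into a dict keyed by task; the caption lives under '<MORE_DETAILED_CAPTION>'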
parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height))

return parsed_answer['<MORE_DETAILED_CAPTION>']


class loader_phi3vision(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
repo_id = VISION_MODELS[chosen_model]["repo_id"]
save_dir = VISION_MODELS[chosen_model]["cache_dir"]
cache_dir = CACHE_DIR / save_dir
cache_dir.mkdir(parents=True, exist_ok=True)

quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4"
)

# microsoft/Phi-3-vision-128k-instruct
model = AutoModelForCausalLM.from_pretrained(
repo_id,
device_map="cuda",
trust_remote_code=True,
torch_dtype="auto",
attn_implementation='flash_attention_2',
quantization_config=quantization_config,
cache_dir=cache_dir
)

processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)

my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green")

return model, None, processor

@torch.inference_mode()
def process_single_image(self, raw_image):
prompt = f"""<|user|>
<|image_1|>
Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|>
<|assistant|>
"""
inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device)

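# greedy decoding: do_sample=False, so the temperature setting has no effect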
generation_args = {
"max_new_tokens": 500,
"temperature": None,
"do_sample": False,
}

generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)

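# strip the prompt tokens from the generated ids so only the model's reply is decoded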
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]

return response


class loader_minicpm_llama3v(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
repo_id = VISION_MODELS[chosen_model]["repo_id"]
save_dir = VISION_MODELS[chosen_model]["cache_dir"]
cache_dir = CACHE_DIR / save_dir
cache_dir.mkdir(parents=True, exist_ok=True)

warnings.filterwarnings("ignore", category=UserWarning)

# openbmb/MiniCPM-Llama3-V-2_5-int4
model = AutoModel.from_pretrained(
repo_id,
trust_remote_code=True,
low_cpu_mem_usage=True,
cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(
repo_id,
trust_remote_code=True,
cache_dir=cache_dir
)
model.eval()

my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")

return model, tokenizer, None

@torch.inference_mode()
def process_single_image(self, raw_image):
question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
msgs = [{'role': 'user', 'content': question}]

response = self.model.chat(
image=raw_image,
msgs=msgs,
context=None,
tokenizer=self.tokenizer,
sampling=False,
temperature=None
)

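# model.chat may return either a plain string or a (text, context, _) tuple depending on the model code version, so handle both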
if isinstance(response, tuple) and len(response) == 3:
res, context, _ = response
else:
res = response

return res

'''
class loader_bunny(BaseLoader):
def initialize_model_and_tokenizer(self):
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
#BAAI/Bunny-v1_1-4B
# BAAI/Bunny-v1_1-Llama-3-8B-V
chosen_model = self.config['vision']['chosen_model']
model_path = VISION_MODELS[chosen_model]["model_path"]
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_quant_type="nf4"
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map='auto',
trust_remote_code=True,
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True
)
my_cprint(f"Bunny vision model loaded into memory...", "green")
return model, tokenizer, None
@torch.inference_mode()
def process_single_image(self, raw_image):
prompt = "Describe what this image depicts in as much detail as possible."
text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
output_ids = self.model.generate(
input_ids,
images=image_tensor,
max_length=4096,
use_cache=True,
repetition_penalty=1.0
)[0].to(self.device)
result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
return result
'''
