diff --git a/src/module_process_images.py b/src/module_process_images.py
index 4a4010e..ecffdfc 100644
--- a/src/module_process_images.py
+++ b/src/module_process_images.py
@@ -15,7 +15,7 @@ from tqdm import tqdm
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
-    LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
+    LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
 )
 
 from langchain_community.docstore.document import Document
@@ -33,8 +33,19 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
 logging.getLogger().setLevel(logging.WARNING)
 
+# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
+# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
+# logging.getLogger("transformers").setLevel(logging.ERROR)
+# logging.getLogger("transformers").setLevel(logging.WARNING)
+# logging.getLogger("transformers").setLevel(logging.INFO)
+# logging.getLogger("transformers").setLevel(logging.DEBUG)
+
 ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
 
+current_directory = Path(__file__).parent
+CACHE_DIR = current_directory / "models" / "vision"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
 current_directory = Path(__file__).parent
 VISION_DIR = current_directory / "models" / "vision"
 VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -65,10 +76,20 @@ def choose_image_loader():
     chosen_model = config["vision"]["chosen_model"]
 
-    if chosen_model == 'Moondream2':
+    if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b', ]:
+        loader_func = loader_llava(config).process_images
+    elif chosen_model == 'Cogvlm':
+        loader_func = loader_cogvlm(config).process_images
+    elif chosen_model == 'Moondream2':
         loader_func = loader_moondream(config).process_images
     elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
         loader_func = loader_florence2(config).process_images
+    elif chosen_model == 'Phi-3-vision-128k-instruct':
+        loader_func = loader_phi3vision(config).process_images
+    elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
+        loader_func = loader_minicpm_llama3v(config).process_images
+    elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
+        loader_func = loader_llava_next(config).process_images
     else:
         my_cprint("No valid image model specified in config.yaml", "red")
         return []
@@ -143,6 +164,125 @@ def process_images(self):
 
     def process_single_image(self, raw_image):
         raise NotImplementedError("Subclasses must implement process_single_image method")
 
+class loader_cogvlm(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        model_name = 'THUDM/cogvlm-chat-hf'
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)
+
+        tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=quantization_config,
+            torch_dtype=TORCH_TYPE,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Cogvlm vision model loaded into memory...", "green")
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+        inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
+        inputs = {
+            'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
+            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
+            'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
+            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
+        }
+
+        gen_kwargs = {"max_length": 2048, "do_sample": False}
+        output = self.model.generate(**inputs, **gen_kwargs)
+        output = output[:, inputs['input_ids'].shape[1]:]
+        model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
+        return model_response
+
+class loader_llava(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+
+        model = LlavaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
+        inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
+        inputs = inputs.to(torch.float32)
+
+        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
+        full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)
+        model_response = full_response.split("ASSISTANT: ")[-1]
+        return model_response
+
+
+class loader_llava_next(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        model = LlavaNextForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+ prompt = f"USER: \n{user_prompt} ASSISTANT:" + inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device) + + output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False) + + response = self.processor.decode(output[0], skip_special_tokens=True) # possibly adjust to "full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)" or something similar if output is preceded by special tokens inexplicatly + model_response = response.split("ASSISTANT:")[-1].strip() + + return model_response + class loader_moondream(BaseLoader): def initialize_model_and_tokenizer(self): @@ -159,7 +299,7 @@ def initialize_model_and_tokenizer(self): my_cprint(f"Moondream2 vision model loaded into memory...", "green") - tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-05-20", cache_dir=cache_dir) + tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-07-23", cache_dir=cache_dir) return model, tokenizer, None @@ -180,11 +320,14 @@ def __init__(self, config): def initialize_model_and_tokenizer(self): chosen_model = self.config['vision']['chosen_model'] - model_id = VISION_MODELS[chosen_model]['repo_id'] - cache_dir=VISION_DIR + repo_id = VISION_MODELS[chosen_model]["repo_id"] + save_dir = VISION_MODELS[chosen_model]["cache_dir"] - model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir) - processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir) + cache_dir = CACHE_DIR / save_dir + cache_dir.mkdir(parents=True, exist_ok=True) + + model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir) + processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir) device_type, precision_type = self.get_device_and_precision() @@ -228,3 +371,173 @@ def process_single_image(self, raw_image): parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height)) return parsed_answer[''] + + +class loader_phi3vision(BaseLoader): + def initialize_model_and_tokenizer(self): + chosen_model = self.config['vision']['chosen_model'] + repo_id = VISION_MODELS[chosen_model]["repo_id"] + save_dir = VISION_MODELS[chosen_model]["cache_dir"] + cache_dir = CACHE_DIR / save_dir + cache_dir.mkdir(parents=True, exist_ok=True) + + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4" + ) + + # microsoft/Phi-3-vision-128k-instruct + model = AutoModelForCausalLM.from_pretrained( + repo_id, + device_map="cuda", + trust_remote_code=True, + torch_dtype="auto", + attn_implementation='flash_attention_2', + quantization_config=quantization_config, + cache_dir=cache_dir + ) + + processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir) + + my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green") + + return model, None, processor + + @torch.inference_mode() + def process_single_image(self, raw_image): + prompt = f"""<|user|> +<|image_1|> +Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|> +<|assistant|> +""" + inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device) + + generation_args = { + "max_new_tokens": 500, + "temperature": None, + "do_sample": False, + } + + generate_ids = self.model.generate( + **inputs, + 
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            **generation_args
+        )
+
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = self.processor.batch_decode(
+            generate_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+
+        return response
+
+
+class loader_minicpm_llama3v(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        warnings.filterwarnings("ignore", category=UserWarning)
+
+        # openbmb/MiniCPM-Llama3-V-2_5-int4
+        model = AutoModel.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            cache_dir=cache_dir
+        )
+        model.eval()
+
+        my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
+        msgs = [{'role': 'user', 'content': question}]
+
+        response = self.model.chat(
+            image=raw_image,
+            msgs=msgs,
+            context=None,
+            tokenizer=self.tokenizer,
+            sampling=False,
+            temperature=None
+        )
+
+        if isinstance(response, tuple) and len(response) == 3:
+            res, context, _ = response
+        else:
+            res = response
+
+        return res
+
+'''
+class loader_bunny(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        transformers.logging.set_verbosity_error()
+        transformers.logging.disable_progress_bar()
+        warnings.filterwarnings('ignore')
+
+        # BAAI/Bunny-v1_1-4B
+        # BAAI/Bunny-v1_1-Llama-3-8B-V
+
+        chosen_model = self.config['vision']['chosen_model']
+        model_path = VISION_MODELS[chosen_model]["model_path"]
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            device_map='auto',
+            trust_remote_code=True,
+            quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Bunny vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "Describe what this image depicts in as much detail as possible."
+        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
+
+        text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
+
+        image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
+
+        output_ids = self.model.generate(
+            input_ids,
+            images=image_tensor,
+            max_length=4096,
+            use_cache=True,
+            repetition_penalty=1.0
+        )[0].to(self.device)
+
+        result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        return result
+'''
\ No newline at end of file
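
Reviewer note: the new LLaVA, LLaVA-Next, Phi-3, and MiniCPM loaders all resolve their checkpoints through VISION_MODELS[chosen_model] (CogVLM hard-codes its repo), so this diff assumes the companion constants module carries entries with at least "repo_id" and "cache_dir" keys; the LLaVA loaders also read "precision", and the commented-out Bunny loader expects "model_path". A minimal sketch of what such entries might look like follows. Only the CogVLM, Phi-3, and MiniCPM repo ids are named in the diff itself, so the llava-hf repo ids and the cache_dir folder names below are assumptions for illustration, not the project's actual values.

# Hypothetical sketch only -- the real entries live in the project's constants module.
VISION_MODELS = {
    "llava 1.5 - 7b": {
        "repo_id": "llava-hf/llava-1.5-7b-hf",              # assumed repo id
        "precision": "float16",
        "cache_dir": "llava-1.5-7b",                        # assumed folder under models/vision
    },
    "Llava 1.6 Vicuna - 7b": {
        "repo_id": "llava-hf/llava-v1.6-vicuna-7b-hf",      # assumed repo id
        "precision": "float16",
        "cache_dir": "llava-1.6-vicuna-7b",
    },
    "Phi-3-vision-128k-instruct": {
        "repo_id": "microsoft/Phi-3-vision-128k-instruct",  # named in the diff
        "precision": "bfloat16",
        "cache_dir": "phi-3-vision-128k-instruct",
    },
    "MiniCPM-Llama3-V-2_5-int4": {
        "repo_id": "openbmb/MiniCPM-Llama3-V-2_5-int4",     # named in the diff
        "precision": "int4",
        "cache_dir": "minicpm-llama3-v-2_5-int4",
    },
}

With entries like these in place, setting vision.chosen_model in config.yaml (for example to 'Llava 1.6 Vicuna - 7b') routes choose_image_loader() to the matching loader, which downloads the weights into models/vision/<cache_dir> on first use.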