Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Ola path for GPUs #562

Merged
Merged 3 commits on Mar 1, 2025 (the source and target branch names were not captured in this extract).
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 40 additions & 38 deletions lmms_eval/models/ola.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -176,7 +176,7 @@ def __init__(
)

self._config = self._model.config
self.model.to("cuda").eval().bfloat16()
self.model.to(self.device).eval().bfloat16()
self.model.tie_weights()
self.truncation = truncation
self.batch_size_per_gpu = int(batch_size)
Expand Down Expand Up @@ -207,10 +207,9 @@ def __init__(
elif accelerator.num_processes == 1 and device_map == "auto":
eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
self._rank = 0
self._word_size = 1
self._world_size = 1
else:
eval_logger.info(f"Using single device: {self._device}")
self.model.to(self._device)
self._rank = 0
self._world_size = 1
self.accelerator = accelerator
Expand Down Expand Up @@ -405,10 +404,11 @@ def process_audio(self, audio_array, sampling_rate):
audio = audio[:, 0]
target_sr = 16000
CHUNK_LIM = 480000
import librosa

if sampling_rate != target_sr:
speech_wav = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=target_sr).astype(np.float32)
else:
speech_wav = audio_array.astype(np.float32)
speechs = []
speech_wavs = []

Expand Down Expand Up @@ -485,13 +485,13 @@ def _collate(x):
eval_logger.info(f"Video {visuals} can not load, check the source")
continue
audio = self.extract_audio(visual)
audio.write_audiofile("./video_audio.wav")
video_audio_path = "./video_audio.wav"
audio.write_audiofile(f"./video_audio_{self.rank}.wav")
video_audio_path = f"./video_audio_{self.rank}.wav"
speech, speech_length, speech_chunk, speech_wav = self.load_audio(video_audio_path)
speechs.append(speech.bfloat16().to("cuda"))
speech_lengths.append(speech_length.to("cuda"))
speech_chunks.append(speech_chunk.to("cuda"))
speech_wavs.append(speech_wav.to("cuda"))
speechs.append(speech.bfloat16().to(self.device))
speech_lengths.append(speech_length.to(self.device))
speech_chunks.append(speech_chunk.to(self.device))
speech_wavs.append(speech_wav.to(self.device))
os.remove(video_audio_path)

# Process images of video
Expand All @@ -508,7 +508,7 @@ def _collate(x):
if frame_idx is None:
frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()

video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
video_processed = torch.cat(video_processed, dim=0).bfloat16().to(self.device)
video_processed = (video_processed, video_processed)

video_data = (video_processed, (384, 384), "video")
Expand All @@ -522,44 +522,46 @@ def _collate(x):
image_tensor_, image_highres_tensor_ = process_anyres_highres_image(visual, self._image_processor)
image_tensor.append(image_tensor_)
image_highres_tensor.append(image_highres_tensor_)
if all(x.shape == image_tensor[0].shape for x in image_tensor):
image_tensor = torch.stack(image_tensor, dim=0)
if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
if type(image_tensor) is list:
image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
else:
image_tensor = image_tensor.bfloat16().to("cuda")
if type(image_highres_tensor) is list:
image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
else:
image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

# Processing dummy audio, as required by model
speechs.append(torch.zeros(1, 3000, 128).bfloat16().to("cuda"))
speech_lengths.append(torch.LongTensor([3000]).to("cuda"))
speech_wavs.append(torch.zeros([1, 480000]).to("cuda"))
speech_chunks.append(torch.LongTensor([1]).to("cuda"))

elif isinstance(visual, dict) and "array" in visual: # For Audio
if MODALITY is None:
MODALITY = "AUDIO"
mels, speech_length, speech_chunk, speech_wav = self.process_audio(visual["array"], visual["sampling_rate"])
speechs.append(mels.bfloat16().to("cuda"))
speech_lengths.append(speech_length.to("cuda"))
speech_chunks.append(speech_chunk.to("cuda"))
speech_wavs.append(speech_wav.to("cuda"))
speechs.append(mels.bfloat16().to(self.device))
speech_lengths.append(speech_length.to(self.device))
speech_chunks.append(speech_chunk.to(self.device))
speech_wavs.append(speech_wav.to(self.device))

# Processing dummy images, as required by model
images.append(torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device="cuda", non_blocking=True))
images_highres.append(torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device="cuda", non_blocking=True))
images.append(torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device=self.device, non_blocking=True))
images_highres.append(torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device=self.device, non_blocking=True))
image_sizes.append((224, 224))

if not video_processed and MODALITY == "VIDEO":
# If video is not processed, skip the iteration
pbar.update(1)
continue

if MODALITY == "IMAGE":
if all(x.shape == image_tensor[0].shape for x in image_tensor):
image_tensor = torch.stack(image_tensor, dim=0)
if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
if type(image_tensor) is list:
image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
else:
image_tensor = image_tensor.bfloat16().to("cuda")
if type(image_highres_tensor) is list:
image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
else:
image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

# Processing dummy audio, as required by model
speechs.append(torch.zeros(1, 3000, 128).bfloat16().to("cuda"))
speech_lengths.append(torch.LongTensor([3000]).to("cuda"))
speech_wavs.append(torch.zeros([1, 480000]).to("cuda"))
speech_chunks.append(torch.LongTensor([1]).to("cuda"))

# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
Expand Down Expand Up @@ -601,11 +603,11 @@ def _collate(x):
eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{prompt}\n")

if MODALITY == "AUDIO":
input_ids = tokenizer_speech_token(prompt, self.tokenizer, SPEECH_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self._device)
input_ids = tokenizer_speech_token(prompt, self.tokenizer, SPEECH_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
elif MODALITY == "IMAGE":
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self._device)
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
elif MODALITY == "VIDEO":
input_ids = tokenizer_speech_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to("cuda")
input_ids = tokenizer_speech_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
pad_token_ids = 151643
attention_masks = input_ids.ne(pad_token_ids).long().to(self.device)

Expand Down