From 254707b53be28d6fcdaf6acfb8ac270ac06a77d4 Mon Sep 17 00:00:00 2001
From: Tejaswini Jayashanker
Date: Tue, 9 Dec 2025 09:03:32 +0530
Subject: [PATCH 1/2] Updated readme.md

Signed-off-by: Tejaswini Jayashanker
---
 readme.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/readme.md b/readme.md
index 75a58bd..0802e8c 100644
--- a/readme.md
+++ b/readme.md
@@ -2,17 +2,17 @@
 # pipeline of the data engine

-### read the grounding data from json file
+### read the grounding data from the JSON file

 for each sample, pre-store the other samples sharing the same image. add the

-### model predict and save the jons files
+### model predict and save the JSON files

 - visually inspect the JSON files
 - found that some boxes overlap heavily but carry different text
 - how to deal with these boxes?

 ### merge model predictions into the labels

-- discard the bbox with higher iou ( > 0.8, higher iou , no consider the class or text)
+- discard boxes with high IoU (> 0.8), without considering class or text (see the IoU sketch after this patch)

@@ -21,7 +21,7 @@

 - generate the visual prompt embedding for each instance (bbox)

-- merge bboxes within the same image ( consider the vpe distance and text similarity ,bbox iou<0.8 )
+- merge bboxes within the same image (considering the VPE distance and text similarity; bbox IoU < 0.8)

 - convert to the grounding-format cache for training

@@ -31,4 +31,4 @@

 to do:

-write a tools to visual the bbox ious. within the same images
\ No newline at end of file
+Write a tool to visualize the BBox IoUs within the same image.
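
The merge rule above reduces to a plain IoU test between each model prediction and the existing label boxes. A minimal sketch of that check, assuming xyxy boxes and the 0.8 threshold from the notes above (the function names are illustrative, not part of this repo):

```python
import numpy as np


def iou_xyxy(a, b) -> float:
    """IoU of two boxes given as [x0, y0, x1, y1]."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0


def keep_prediction(pred_box, label_boxes, thr=0.8) -> bool:
    # Drop a predicted box once it overlaps any label box with IoU > thr,
    # regardless of class or text, per the merge rule in the readme.
    return all(iou_xyxy(pred_box, lb) <= thr for lb in label_boxes)
```

From 3e6afe343285a6d4377dbddaa842a8b34271d854 Mon Sep 17 00:00:00 2001
From: UltralyticsAssistant
Date: Tue, 9 Dec 2025 03:34:20 +0000
Subject: [PATCH 2/2] Auto-format by https://ultralytics.com/actions

---
 data_engine.py                  | 549 +++++++++++++++-----------------
 data_engine_agent.py            | 247 +++++++-------
 data_visual.py                  |  42 +--
 data_visual_flickr.py           |  39 +--
 data_visual_mixed.py            |  37 +--
 data_visual_object365.py        |  34 +-
 do_flickr.sh                    |  11 +-
 do_mixed.sh                     |  14 +-
 grounding_dataset_visualizer.py | 236 +++++++-------
 log.md                          |  14 +-
 readme.md                       |  32 +-
 refine_text.py                  | 240 ++++++--------
 remove_segment.py               |  34 +-
 utils.py                        |  23 +-
 visual_json.py                  |  60 ++--
 15 files changed, 742 insertions(+), 870 deletions(-)

diff --git a/data_engine.py b/data_engine.py
index b2bd18d..1b37420 100644
--- a/data_engine.py
+++ b/data_engine.py
@@ -1,39 +1,43 @@
-import ultralytics,os
+import os
+
+import ultralytics
+
 workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__)))
 os.chdir(workspace)
 print("set workspace:", workspace)

-import numpy as np
-from pathlib import Path
 import os
-from PIL import Image, ImageDraw, ImageFont
+from pathlib import Path

+import numpy as np
+from PIL import Image
 from ultralytics.data.utils import load_dataset_cache_file
 from ultralytics.engine.results import Results

 os.chdir(os.path.dirname(os.path.abspath(__file__)))


 def get_names_from_yaml_config(yaml_config):
     import yaml
+
     if not os.path.exists(yaml_config):
         raise FileNotFoundError(f"YAML config file not found: {yaml_config}")
-    with open(yaml_config, 'r') as f:
+    with open(yaml_config) as f:
         data_dict = yaml.safe_load(f)
-    names = data_dict['names']
+    names = data_dict["names"]
     return names


-class YoloBox(object):
-
-    def __init__(self,img_shape:list):
-        assert len(img_shape)==2, "img_sz should be (height,width)"
-        self.img_h=img_shape[0]
-        self.img_w=img_shape[1]
-        self.xyxy=None
-        self.xywhn=None # normalized xywh
+class YoloBox:
+    def __init__(self, img_shape: list):
+        assert 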
len(img_shape) == 2, "img_sz should be (height,width)" + self.img_h = img_shape[0] + self.img_w = img_shape[1] + self.xyxy = None + self.xywhn = None # normalized xywh - def load_from_xywhn_normalized(self,bboxes_xywhn): + def load_from_xywhn_normalized(self, bboxes_xywhn): # xywhn: [N,4] x_center,y_center,w,h (normalized) bboxes_xyxy = np.zeros_like(bboxes_xywhn) if bboxes_xywhn.shape[0] > 0: @@ -42,14 +46,15 @@ def load_from_xywhn_normalized(self,bboxes_xywhn): bboxes_xyxy[:, 2] = (bboxes_xywhn[:, 0] + bboxes_xywhn[:, 2] / 2) * self.img_w bboxes_xyxy[:, 3] = (bboxes_xywhn[:, 1] + bboxes_xywhn[:, 3] / 2) * self.img_h - self.xyxy=bboxes_xyxy + self.xyxy = bboxes_xyxy - self.xywhn=bboxes_xywhn + self.xywhn = bboxes_xywhn return self - def load_from_xyxy(self,bboxes_xyxy): - if isinstance(bboxes_xyxy,list): - bboxes_xyxy=np.array(bboxes_xyxy) - + + def load_from_xyxy(self, bboxes_xyxy): + if isinstance(bboxes_xyxy, list): + bboxes_xyxy = np.array(bboxes_xyxy) + # Ensure the array is of a numeric type bboxes_xyxy = np.array(bboxes_xyxy, dtype=np.float32) @@ -68,367 +73,350 @@ def load_from_xyxy(self,bboxes_xyxy): bboxes_xywhn[:, 2] = (bboxes_xyxy[:, 2] - bboxes_xyxy[:, 0]) / self.img_w bboxes_xywhn[:, 3] = (bboxes_xyxy[:, 3] - bboxes_xyxy[:, 1]) / self.img_h - self.xyxy=bboxes_xyxy + self.xyxy = bboxes_xyxy - self.xywhn=bboxes_xywhn + self.xywhn = bboxes_xywhn return self - - def iou(self,bbox_xyxy): + + def iou(self, bbox_xyxy): # bbox_xyxy: [4,] x0,y0,x1,y1 assert self.xyxy is not None, "self.xyxy is None, please load the box first" - ious=[] + ious = [] for i in range(self.xyxy.shape[0]): - box=self.xyxy[i] - xi1=max(box[0],bbox_xyxy[0]) - yi1=max(box[1],bbox_xyxy[1]) - xi2=min(box[2],bbox_xyxy[2]) - yi2=min(box[3],bbox_xyxy[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box[2]-box[0])*(box[3]-box[1]) - box2_area=(bbox_xyxy[2]-bbox_xyxy[0])*(bbox_xyxy[3]-bbox_xyxy[1]) - union_area=box1_area+box2_area-inter_area - iou=inter_area/union_area if union_area>0 else 0 + box = self.xyxy[i] + xi1 = max(box[0], bbox_xyxy[0]) + yi1 = max(box[1], bbox_xyxy[1]) + xi2 = min(box[2], bbox_xyxy[2]) + yi2 = min(box[3], bbox_xyxy[3]) + inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1) + box1_area = (box[2] - box[0]) * (box[3] - box[1]) + box2_area = (bbox_xyxy[2] - bbox_xyxy[0]) * (bbox_xyxy[3] - bbox_xyxy[1]) + union_area = box1_area + box2_area - inter_area + iou = inter_area / union_area if union_area > 0 else 0 ious.append(iou) return np.array(ious) import torch + class DataEngine: - - def __init__(self,device="cuda"): - self.device=device + def __init__(self, device="cuda"): + self.device = device def load_yoloe(self): from ultralytics import YOLOE - model_path="/root/ultra_louis_work/ultralytics/yoloe-11l-seg.pt" - yaml_file="yoloe-11l-seg.yaml" - if hasattr(self,'model'): + model_path = "/root/ultra_louis_work/ultralytics/yoloe-11l-seg.pt" + yaml_file = "yoloe-11l-seg.yaml" + if hasattr(self, "model"): # clear the existing model del self.model torch.cuda.empty_cache() - - self.model=YOLOE(yaml_file).load(model_path).to(self.device) + self.model = YOLOE(yaml_file).load(model_path).to(self.device) print("load model from:", model_path) - def set_classes(self,yaml_config=None,name_list=None, text_embed_pt=None): - + def set_classes(self, yaml_config=None, name_list=None, text_embed_pt=None): # only one of yaml_config and name_list should be provided assert (yaml_config is None) or (name_list is None), "Only one of yaml_config and name_list should be provided" if yaml_config is not None: - 
assert name_list is None, "If yaml_config is provided, name_list should be None" name_list = get_names_from_yaml_config(yaml_config) - name_list=list(name_list.values()) + name_list = list(name_list.values()) print("Load names from yaml:", yaml_config) if text_embed_pt is not None: assert os.path.exists(text_embed_pt), f"Text embed pt file not found: {text_embed_pt}" - txt_map= torch.load(text_embed_pt, map_location=self.device) - name_list=list(txt_map.keys()) + txt_map = torch.load(text_embed_pt, map_location=self.device) + name_list = list(txt_map.keys()) print("Load text embed from:", text_embed_pt) + assert name_list is None or isinstance(name_list, list), "name_list should be a list of strings or None" - assert name_list is None or isinstance(name_list,list), "name_list should be a list of strings or None" - - if name_list is not None : + if name_list is not None: print(f"Set {len(name_list)} classes") self.model.set_classes(name_list, self.model.get_text_pe(name_list)) - self.names=name_list + self.names = name_list else: print("No classes set") - - - def yoloe_predict(self,indice,conf=0.05,save_path=None): - img_file=self.labels[indice]['im_file'] - if hasattr(self,'img_source'): - img_file=os.path.join(self.img_source,img_file) - result=self.model.predict(img_file,conf=conf) + def yoloe_predict(self, indice, conf=0.05, save_path=None): + img_file = self.labels[indice]["im_file"] + if hasattr(self, "img_source"): + img_file = os.path.join(self.img_source, img_file) + result = self.model.predict(img_file, conf=conf) if save_path is not None: result[0].save(save_path) print("save to:", save_path) return result - def yoloe_predict_batch(self, labels, conf=0.05,iou=0.4): - img_files=[] + def yoloe_predict_batch(self, labels, conf=0.05, iou=0.4): + img_files = [] for label in labels: - img_file=label['im_file'] - if hasattr(self,'img_source'): - img_file=os.path.join(self.img_source,img_file) + img_file = label["im_file"] + if hasattr(self, "img_source"): + img_file = os.path.join(self.img_source, img_file) img_files.append(img_file) if not img_files: return [] - - return list(self.model.predict(img_files, conf=conf,iou=iou, batch=len(img_files),stream=True)) + return list(self.model.predict(img_files, conf=conf, iou=iou, batch=len(img_files), stream=True)) def __len__(self): return len(self.labels) + def set_img_folder(self, img_source): + self.img_source = img_source + def load_cached_label(self, cache_path, data_style="grounding", yaml_config=None, text_embed_pt=None): + self.cache_path = cache_path - - def set_img_folder(self,img_source): - self.img_source=img_source - - def load_cached_label(self,cache_path, data_style="grounding",yaml_config=None, text_embed_pt=None ): - - self.cache_path=cache_path - - cache=load_dataset_cache_file(Path(cache_path)) - self.cache=cache - self.labels=cache["labels"] + cache = load_dataset_cache_file(Path(cache_path)) + self.cache = cache + self.labels = cache["labels"] print(len(self.labels)) - assert data_style in ["grounding","detection"] - self.data_style=data_style - - if data_style=="detection": + assert data_style in ["grounding", "detection"] + self.data_style = data_style + + if data_style == "detection": assert yaml_config is not None, "yaml_config must be provided for detection data_style" if not os.path.exists(yaml_config): raise FileNotFoundError(f"YAML config file not found: {yaml_config}") - self.yaml_config=yaml_config + self.yaml_config = yaml_config - # read names from the yaml file + # read names from the yaml file import yaml - 
with open(yaml_config, 'r') as f: + + with open(yaml_config) as f: data_dict = yaml.safe_load(f) - self.names = data_dict['names'] + self.names = data_dict["names"] - elif data_style=="grounding": + elif data_style == "grounding": assert text_embed_pt is not None, "text_embed_pt must be provided for grounding data_style" - self.text_embed_pt=text_embed_pt + self.text_embed_pt = text_embed_pt if not os.path.exists(text_embed_pt): raise FileNotFoundError(f"Text embed pt file not found: {text_embed_pt}") else: print("Load text embed from:", text_embed_pt) txt_map = torch.load(text_embed_pt, map_location=self.device, weights_only=False) - self.names=list(txt_map.keys()) - + self.names = list(txt_map.keys()) def print_data_info(self): - """ - Print information about the dataset labels: - - data_style - - Total number of labels - - Total number of boxes (for detection and grounding) + """Print information about the dataset labels: - data_style - Total number of labels - Total number of boxes + (for detection and grounding). """ print(f"Data style: {self.data_style}") - print("Keys: {}".format(self.labels[0].keys() if len(self.labels)>0 else "No labels") ) + print("Keys: {}".format(self.labels[0].keys() if len(self.labels) > 0 else "No labels")) print(f"Total number of labels: {len(self.labels)}") if self.data_style in ["detection", "grounding"]: total_boxes = sum(len(label.get("bboxes", [])) for label in self.labels) print(f"Total number of boxes: {total_boxes}") - def remove_masks_and_segments(self): for label in tqdm(self.labels): - label["segments"]=[] + label["segments"] = [] - def save_cached_label(self,save_path=None): + def save_cached_label(self, save_path=None): if save_path is None: - save_path=self.cache_path + save_path = self.cache_path from copy import deepcopy - copy_cache=deepcopy(self.cache) - copy_cache["labels"]=self.labels + + copy_cache = deepcopy(self.cache) + copy_cache["labels"] = self.labels with open(save_path, "wb") as f: np.save(f, copy_cache) - def print_one_label(self,indice): - - label=self.labels[indice] + def print_one_label(self, indice): + label = self.labels[indice] # print(self.labels[indice]) print(label.keys()) - print(label['im_file']) - for key,val in label.items(): + print(label["im_file"]) + for key, val in label.items(): print(f"{key}: {type(val)}") - if isinstance(val,list): + if isinstance(val, list): print(f" Length: {len(val)}") # if len(val)>0: # print(f" First 3 elements: {val[:3]}") - elif isinstance(val,np.ndarray): + elif isinstance(val, np.ndarray): print(f" Shape: {val.shape}") print(f" Dtype: {val.dtype}") print(f" First 5 elements: {val.flatten()[:5]}") - - elif isinstance(val,dict): + + elif isinstance(val, dict): print(f" Dict with keys: {list(val.keys())}") else: print(f" Value: {val}") - - def detection_predict_and_update_labels(self,indice,iou=0.3,replace=True,conf=0.1): - result=self.yoloe_predict(indice=indice,conf=conf) + def detection_predict_and_update_labels(self, indice, iou=0.3, replace=True, conf=0.1): + result = self.yoloe_predict(indice=indice, conf=conf) if not result: return - self._update_detection_label(indice,result[0],iou=iou,replace=replace) + self._update_detection_label(indice, result[0], iou=iou, replace=replace) def detection_predict_and_update_labels_batch(self, indices, iou=0.3, replace=False, conf=0.1): - results=self.yoloe_predict_batch([ self.labels[i] for i in indices ], conf=conf) - assert len(results)==len(indices), "Mismatch between results and indices length" - for indice,res in zip(indices,results): - 
self._update_detection_label(indice,res,iou=iou,replace=replace) + results = self.yoloe_predict_batch([self.labels[i] for i in indices], conf=conf) + assert len(results) == len(indices), "Mismatch between results and indices length" + for indice, res in zip(indices, results): + self._update_detection_label(indice, res, iou=iou, replace=replace) def _update_detection_label(self, indice, result_obj, iou=0.3, replace=True): assert self.data_style == "detection", "_update_detection_label requires detection data_style" - boxes=result_obj.boxes - bboxes_xyxy=boxes.xyxy.cpu().numpy() - yolo_box=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) - bboxes_xywhn=yolo_box.xywhn - cls=boxes.cls.cpu().numpy() - assert bboxes_xywhn.shape[0]==cls.shape[0], "Mismatch between number of boxes and classes" + boxes = result_obj.boxes + bboxes_xyxy = boxes.xyxy.cpu().numpy() + yolo_box = YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) + bboxes_xywhn = yolo_box.xywhn + cls = boxes.cls.cpu().numpy() + assert bboxes_xywhn.shape[0] == cls.shape[0], "Mismatch between number of boxes and classes" if replace: - self.labels[indice]['bboxes']=bboxes_xywhn - self.labels[indice]['cls']=cls + self.labels[indice]["bboxes"] = bboxes_xywhn + self.labels[indice]["cls"] = cls print(f"Replace with {bboxes_xywhn.shape[0]} boxes") return - keep_indices=[] + keep_indices = [] for i in range(bboxes_xywhn.shape[0]): - bbox=bboxes_xywhn[i] - max_iou=0 - for j in range(self.labels[indice]['bboxes'].shape[0]): - exist_bbox=self.labels[indice]['bboxes'][j] - box1=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(bbox[np.newaxis,:]).xyxy[0] - box2=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(exist_bbox[np.newaxis,:]).xyxy[0] - xi1=max(box1[0],box2[0]) - yi1=max(box1[1],box2[1]) - xi2=min(box1[2],box2[2]) - yi2=min(box1[3],box2[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box1[2]-box1[0])*(box1[3]-box1[1]) - box2_area=(box2[2]-box2[0])*(box2[3]-box2[1]) - union_area=box1_area+box2_area-inter_area - current_iou=inter_area/union_area if union_area>0 else 0 - if current_iou>max_iou: - max_iou=current_iou - if max_iou 0 else 0 + if current_iou > max_iou: + max_iou = current_iou + if max_iou < iou: keep_indices.append(i) print(f"Append {len(keep_indices)} new boxes out of {bboxes_xywhn.shape[0]}") for i in keep_indices: - bbox=bboxes_xywhn[i] - c=cls[i] - self.labels[indice]['bboxes']=np.vstack([self.labels[indice]['bboxes'],bbox]) - self.labels[indice]['cls']=np.vstack([self.labels[indice]['cls'],c]) - - def grounding_predict_and_update_labels_batch(self,indices,iou=0.05,replace=False,conf=0.1): - - - results=self.yoloe_predict_batch(indices, conf=conf) - assert len(results)==len(indices), "Mismatch between results and indices length" - for indice,res in zip(indices,results): - self.labels[indice]= self._update_grounding_label(self.labels[indice],res,iou=iou,replace=replace) + bbox = bboxes_xywhn[i] + c = cls[i] + self.labels[indice]["bboxes"] = np.vstack([self.labels[indice]["bboxes"], bbox]) + self.labels[indice]["cls"] = np.vstack([self.labels[indice]["cls"], c]) + def grounding_predict_and_update_labels_batch(self, indices, iou=0.05, replace=False, conf=0.1): + results = self.yoloe_predict_batch(indices, conf=conf) + assert len(results) == len(indices), "Mismatch between results and indices length" + for indice, res in zip(indices, results): + self.labels[indice] = self._update_grounding_label(self.labels[indice], 
res, iou=iou, replace=replace) def _update_grounding_label(self, label, result_obj, iou=0.1, replace=True): assert self.data_style == "grounding", "_update_grounding_label requires grounding data_style" - boxes=result_obj.boxes - bboxes_xyxy=boxes.xyxy.cpu().numpy() - yolo_box=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) - bboxes_xywhn=yolo_box.xywhn - cls=boxes.cls.cpu().numpy() + boxes = result_obj.boxes + bboxes_xyxy = boxes.xyxy.cpu().numpy() + yolo_box = YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) + bboxes_xywhn = yolo_box.xywhn + cls = boxes.cls.cpu().numpy() - assert bboxes_xywhn.shape[0]==cls.shape[0], "Mismatch between number of boxes and classes" + assert bboxes_xywhn.shape[0] == cls.shape[0], "Mismatch between number of boxes and classes" if replace: - label['bboxes']=bboxes_xywhn - label['cls']=cls - print(f"Replace with {bboxes_xywhn.shape[0]} boxes") + label["bboxes"] = bboxes_xywhn + label["cls"] = cls + print(f"Replace with {bboxes_xywhn.shape[0]} boxes") return - keep_indices=[] + keep_indices = [] for i in range(bboxes_xywhn.shape[0]): - bbox=bboxes_xywhn[i] - max_iou=0 - for j in range(label['bboxes'].shape[0]): - exist_bbox=label['bboxes'][j] - box1=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(bbox[np.newaxis,:]).xyxy[0] - box2=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(exist_bbox[np.newaxis,:]).xyxy[0] - xi1=max(box1[0],box2[0]) - yi1=max(box1[1],box2[1]) - xi2=min(box1[2],box2[2]) - yi2=min(box1[3],box2[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box1[2]-box1[0])*(box1[3]-box1[1]) - box2_area=(box2[2]-box2[0])*(box2[3]-box2[1]) - union_area=box1_area+box2_area-inter_area - current_iou=inter_area/union_area if union_area>0 else 0 - if current_iou>max_iou: - max_iou=current_iou - if max_iou 0 else 0 + if current_iou > max_iou: + max_iou = current_iou + if max_iou < iou: keep_indices.append(i) # get current texts - current_texts= label['texts'] - current_texts= [ text[0] for text in current_texts] # remote the list structure inside - + current_texts = label["texts"] + current_texts = [text[0] for text in current_texts] # remote the list structure inside # get all bbox that should be appended from result_obj with their cls and texts - append_bboxes,append_cls,append_text = [],[],[] + append_bboxes, append_cls, append_text = [], [], [] for i in keep_indices: append_bboxes.append(bboxes_xywhn[i]) append_cls.append(cls[i]) append_text.append(self.names[int(cls[i])]) - - # update the current texts for text in append_text: if text not in current_texts: current_texts.append(text) - label['texts'] = [[text] for text in current_texts] # keep the list structure - + label["texts"] = [[text] for text in current_texts] # keep the list structure # update the append_cls to match the updated texts - updated_append_cls=[] + updated_append_cls = [] for text in append_text: - updated_cls=current_texts.index(text) # + updated_cls = current_texts.index(text) # updated_append_cls.append(updated_cls) - append_cls=updated_append_cls + append_cls = updated_append_cls # format - append_bboxes= np.array(append_bboxes).reshape(-1,4) - append_cls= np.array(append_cls).reshape(-1,1) - - + append_bboxes = np.array(append_bboxes).reshape(-1, 4) + append_cls = np.array(append_cls).reshape(-1, 1) # append the boxes and cls for i in range(append_bboxes.shape[0]): - bbox=append_bboxes[i] - c=append_cls[i] - label['bboxes']=np.vstack([label['bboxes'],bbox]) - 
label['cls']=np.vstack([label['cls'],c]) + bbox = append_bboxes[i] + c = append_cls[i] + label["bboxes"] = np.vstack([label["bboxes"], bbox]) + label["cls"] = np.vstack([label["cls"], c]) # print how many boxes are appended print(f"Append {append_bboxes.shape[0]} new boxes out of {bboxes_xywhn.shape[0]}") return label + def label_append_instance(self, indice, bboxes, cls, texts=None): + assert len(bboxes) == len(cls), "Length of bboxes and cls must be the same" - - def label_append_instance(self,indice,bboxes,cls,texts=None): - - assert len(bboxes)==len(cls), "Length of bboxes and cls must be the same" - - - - - def visual_and_save2(self, indice=None,filename=None, - save_path="./visualize2.jpg"): + def visual_and_save2(self, indice=None, filename=None, save_path="./visualize2.jpg"): """Visualizes a label using ultralytics.engine.results.Results and saves it.""" - - assert self.data_style in ["grounding","detection"] + assert self.data_style in ["grounding", "detection"] print("Visualizing index:", indice) - if indice is None : + if indice is None: assert filename is not None, "Either indice or filename must be provided" - + for idx, label in enumerate(self.labels): - im_file = label['im_file'] + im_file = label["im_file"] # if hasattr(self, 'img_source'): # im_file = os.path.join(self.img_source, im_file) print(im_file) @@ -440,23 +428,23 @@ def visual_and_save2(self, indice=None,filename=None, label = self.labels[indice] print("label keys:", label.keys()) - im_file = label['im_file'] + im_file = label["im_file"] - if hasattr(self, 'img_source'): + if hasattr(self, "img_source"): im_file = os.path.join(self.img_source, im_file) orig_img = np.array(Image.open(im_file)) img_h, img_w = orig_img.shape[:2] - - bboxes_xywhn = label['bboxes'] - cls = label['cls'] - if self.data_style=="detection": + + bboxes_xywhn = label["bboxes"] + cls = label["cls"] + if self.data_style == "detection": assert self.yaml_config is not None, "yaml_config must be provided for detection data_style" names = self.names - - elif self.data_style=="grounding": - names=label.get('texts',None) - names= [ text[0] for text in names] # remote the list structure inside + + elif self.data_style == "grounding": + names = label.get("texts", None) + names = [text[0] for text in names] # remote the list structure inside if isinstance(names, (list, tuple)): names = {int(i): str(n) for i, n in enumerate(names)} @@ -503,73 +491,62 @@ def visual_and_save2(self, indice=None,filename=None, print("Boxes data shape:", boxes_tensor.shape) # Create Results object - result = Results( - orig_img=np.array(orig_img), - path=im_file, - names=names, - boxes=boxes_tensor - ) - # print each bbox witth cls and name from the result object + result = Results(orig_img=np.array(orig_img), path=im_file, names=names, boxes=boxes_tensor) + # print each bbox with cls and name from the result object for i in range(result.boxes.shape[0]): box = result.boxes[i] cls_id = int(box.cls.item()) cls_name = result.names.get(cls_id, "unknown") print(f"Box {i}: Class ID = {cls_id}, Class Name = {cls_name}, Box Coordinates = {box.xyxy.tolist()}") - + print("Number of boxes in Results object:", len(result.boxes) if result.boxes is not None else 0) if result.boxes: - result.boxes.is_track = False # Set to false to avoid printing track_ids + result.boxes.is_track = False # Set to false to avoid printing track_ids result.save(save_path) - # # Plot the results # im_array = result.plot(conf=False) # conf=False to not show confidence scores - + # # Save the visualized 
image""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" print(f"Saved visualization to {save_path}") - from tqdm import tqdm +if __name__ == "__main__": + DATA_NAME = "flickr" # -if __name__=="__main__": - - DATA_NAME="flickr" # - - if DATA_NAME=="Objects365v1": - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" + if DATA_NAME == "Objects365v1": + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.load_yoloe() - de.set_classes(yaml_config=yaml_config) # set classes for the dataset + de.set_classes(yaml_config=yaml_config) # set classes for the dataset - batch_size=64 - for start in tqdm(range(0,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - de.detection_predict_and_update_labels_batch(batch_indices,iou=0.1,conf=0.1) + batch_size = 64 + for start in tqdm(range(0, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + de.detection_predict_and_update_labels_batch(batch_indices, iou=0.1, conf=0.1) de.save_cached_label(save_path=cache_path.replace(".cache", "_updated.cache")) - elif DATA_NAME=="mixed_grounding": - - - # set gpu 3 - device="cuda:1" - de=DataEngine(device=device) - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + elif DATA_NAME == "mixed_grounding": + # set gpu 3 + device = "cuda:1" + de = DataEngine(device=device) + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() - batch_size=32 - for start in tqdm(range(1000,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - batch_texts=[] + batch_size = 32 + for start in tqdm(range(1000, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + batch_texts = [] for indice in batch_indices: label_texts = de.labels[indice].get("texts", []) if isinstance(label_texts, list): @@ -583,7 +560,7 @@ def visual_and_save2(self, indice=None,filename=None, unique_texts = list(dict.fromkeys(batch_texts)) de.set_classes(name_list=unique_texts) else: - de.set_classes(name_list=None) + de.set_classes(name_list=None) # debug_indice = batch_indices[10] # de.visual_and_save2(debug_indice, save_path="./visualized_grounding_example.jpg") @@ -593,26 +570,23 @@ def visual_and_save2(self, indice=None,filename=None, print(f"Error processing batch starting at index {start}: {e}") # de.visual_and_save2(debug_indice, 
save_path="./visualized_grounding_example1.jpg") - de.save_cached_label(save_path=cache_path.replace(".cache", ".updated.cache")) - elif DATA_NAME=="flickr": - - + elif DATA_NAME == "flickr": # set gpu 2 - device="cuda:2" - de=DataEngine(device=device) - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + device = "cuda:2" + de = DataEngine(device=device) + cache_path = ( + "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() - batch_size=128 - for start in tqdm(range(1000,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - batch_texts=[] + batch_size = 128 + for start in tqdm(range(1000, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + batch_texts = [] for indice in batch_indices: label_texts = de.labels[indice].get("texts", []) if isinstance(label_texts, list): @@ -633,5 +607,4 @@ def visual_and_save2(self, indice=None,filename=None, de.grounding_predict_and_update_labels_batch(batch_indices, iou=0.1, conf=0.1) # de.visual_and_save2(debug_indice, save_path="./visualized_grounding_example1.jpg") - de.save_cached_label(save_path=cache_path.replace(".cache", ".updated.cache")) diff --git a/data_engine_agent.py b/data_engine_agent.py index db84554..c43f342 100644 --- a/data_engine_agent.py +++ b/data_engine_agent.py @@ -1,25 +1,26 @@ -from git import List -from matplotlib.pylab import sample -import ultralytics,os +from __future__ import annotations + +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) -from collections import defaultdict +import copy import json -from concurrent.futures import ProcessPoolExecutor, as_completed -from tqdm import tqdm +import multiprocessing as mp import os -import numpy as np -from pathlib import Path from collections import defaultdict -import multiprocessing as mp -from yoloe_data_engine.data_engine import DataEngine +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from pathlib import Path as _Path -import copy import numpy as np -from pathlib import Path as _Path +from tqdm import tqdm +from yoloe_data_engine.data_engine import DataEngine IMAGES_CACHE = None IMNAME_ANNS_CACHE = None @@ -44,6 +45,7 @@ def to_serializable(obj): ######################## Grounding Data Loading Worker ######################## + def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): """Worker invoked in subprocesses to build per-image grounding labels.""" global IMAGES_CACHE, IMNAME_ANNS_CACHE @@ -54,6 +56,7 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): return from ultralytics.data.converter import merge_multi_segment from ultralytics.data.dataset import segments2boxes + img = IMAGES_CACHE[f"{imid:d}"] h, w, f = img["height"], img["width"], img["file_name"] im_file = Path(im_dir) / f # Use the passed im_dir @@ -78,14 +81,14 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, 
folder_name): if box[2] <= 0 or box[3] <= 0: continue caption = ann["caption"] - cat_name = " ".join([caption[t[0]:t[1]] for t in ann["tokens_positive"]]).lower().strip() + cat_name = " ".join([caption[t[0] : t[1]] for t in ann["tokens_positive"]]).lower().strip() if not cat_name: continue if cat_name not in cat2id: cat2id[cat_name] = len(cat2id) texts.append([cat_name]) cls = cat2id[cat_name] - box = [cls] + box.tolist() + box = [cls, *box.tolist()] if box not in bboxes: bboxes.append(box) if ann.get("segmentation") is not None: @@ -97,8 +100,12 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): s = (np.concatenate(s, axis=0) / np.array([w, h], dtype=np.float32)).reshape(-1).tolist() else: s = [j for i in ann["segmentation"] for j in i] - s = (np.array(s, dtype=np.float32).reshape(-1, 2) / np.array([w, h], dtype=np.float32)).reshape(-1).tolist() - s = [cls] + s + s = ( + (np.array(s, dtype=np.float32).reshape(-1, 2) / np.array([w, h], dtype=np.float32)) + .reshape(-1) + .tolist() + ) + s = [cls, *s] segments.append(s) bboxes_xyxy.append(ann["bbox"]) lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32) @@ -117,9 +124,8 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): "bbox_format": "xywh", "texts": texts, } - def serializeLabel(label): - + def serializeLabel(label): lc = copy.deepcopy(label) lc["im_file"] = str(lc.get("im_file", "")) lc["shape"] = list(lc.get("shape", [])) @@ -135,15 +141,16 @@ def serializeLabel(label): lc["normalized"] = bool(lc.get("normalized", True)) lc["bbox_format"] = str(lc.get("bbox_format", "xywh")) return lc + label_serialized = serializeLabel(label) # tmp_file = str(dst_file) + ".tmp" import json + with open(dst_file, "w") as file: json.dump(label_serialized, file, indent=4, ensure_ascii=False) # os.replace(tmp_file, str(dst_file)) - def worker_wrapper(args): return _load_grounding_data(*args) @@ -154,19 +161,21 @@ def init_worker(images_data, imname_anns_data): IMAGES_CACHE = images_data IMNAME_ANNS_CACHE = imname_anns_data + ################################## multi-processing model prediction ############################################ -def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): - """ - Batch model predict in a single process. This can be a method of DataEngine. + +def _batch_model_predict_single_process(self, buffer_dir, im_files, **kwargs): + """Batch model predict in a single process. This can be a method of DataEngine. + Args: self: DataEngine instance buffer_dir: str, buffer directory to save results im_files: list of str, image file paths - kwargs: other keyword arguments for model.predict + kwargs: other keyword arguments for model.predict. 
""" assert isinstance(self, DataEngine) - engine=self + engine = self dst_dir = os.path.join(buffer_dir, "model_predict") os.makedirs(dst_dir, exist_ok=True) conf = kwargs.get("conf", 0.5) @@ -178,7 +187,9 @@ def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): print("All images have been processed, skip.") return process_img_files = [im_files[i] for i in indices] - results = list(engine.model.predict(process_img_files, conf=conf, iou=iou, batch=len(process_img_files), stream=True)) + results = list( + engine.model.predict(process_img_files, conf=conf, iou=iou, batch=len(process_img_files), stream=True) + ) print(f"Processed {len(process_img_files)} images.") for i, sample_index in enumerate(indices): sample = Sample() @@ -188,11 +199,9 @@ def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): return - def _device_predict_worker(args): - """ - Worker function for multi-process model prediction on a specific device. - args: tuple containing (device, buffer_dir, batches, kwargs) + """Worker function for multi-process model prediction on a specific device. args: tuple containing (device, + buffer_dir, batches, kwargs). """ device, buffer_dir, batches, kwargs = args @@ -206,21 +215,21 @@ def _device_predict_worker(args): for im_files in tqdm(batches, desc=f"Device {device} processing batches"): _batch_model_predict_single_process(engine, buffer_dir, im_files, **worker_kwargs) return True + + ############################################################################## +def _merge_prediction_to_sample_label(buffer_dir, sample_json, model_predict_json): + """Each sample have a file_name, we merge the model prediction results (model_predict_json) into the sample + grounding label. step 1: first check the filename match, if false, raise error. step 2: check the dst file + exist, if true, skip. step 3: merge model prediction results into sample grounding label, iou score > 0.5 will + be ignored. step 4: save the merged label to buffer_dir/merge_prediction/. -def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json): - """ - Each sample have a file_name, we merge the model prediction results (model_predict_json) into the sample grounding label. - step 1: first check the filename match, if false, raise error. - step 2: check the dst file exist, if true, skip. - step 3: merge model prediction results into sample grounding label, iou score > 0.5 will be ignored. - step 4: save the merged label to buffer_dir/merge_prediction/ Args: buffer_dir: str, buffer directory to save results sample_json: str, path to sample grounding label json file - model_predict_json: str, path to model prediction json file + model_predict_json: str, path to model prediction json file. 
""" dst_dir = os.path.join(buffer_dir, "merge_prediction") os.makedirs(dst_dir, exist_ok=True) @@ -238,7 +247,7 @@ def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json for model_inst in predict_sample.instances: # Defensive: skip invalid model instances - if getattr(model_inst, 'bbox', None) is None: + if getattr(model_inst, "bbox", None) is None: print(f"[merge][WARN] skipping model instance with empty bbox in '{sample_json}'") continue try: @@ -261,6 +270,7 @@ def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json return True # print(f"Merged label saved to {dst_file}") + def merge_prediction_worker(args): # Support both (idx, buffer_dir, sample_json, model_predict_json) and (buffer_dir, sample_json, model_predict_json) try: @@ -280,13 +290,13 @@ def merge_prediction_worker(args): return _merge_prediction_to_sample_label(buffer_dir, sample_json, model_predict_json) except Exception as e: import traceback + tb = traceback.format_exc() - print(f"[worker][ERROR] idx={idx} file='{sample_json}': {repr(e)}\n{tb}") + print(f"[worker][ERROR] idx={idx} file='{sample_json}': {e!r}\n{tb}") return False - -############################################################################## +############################################################################## class YoloBox: @@ -386,6 +396,7 @@ def iou(self, bbox_xyxy): ious.append(iou) return np.array(ious) + class Instance: def __init__(self, bbox=None, **kwargs): self.bbox = bbox @@ -406,7 +417,7 @@ def set_embed(self, embed): def set_vpe(self, vpe: np.ndarray): self.vpe = vpe.squeeze() - def set_text(self, texts: list, conf: list = None): + def set_text(self, texts: list, conf: list | None = None): self.text = texts self.conf = conf assert len(texts) == len(conf) @@ -418,20 +429,22 @@ def get_top_text_conf(self): def to_dict(self): return { - 'bbox': to_serializable(self.bbox), - 'text': to_serializable(self.text), - 'conf': to_serializable(self.conf), - 'embed': to_serializable(self.embed), - 'vp': to_serializable(self.vpe), - 'other_data': to_serializable(self.other_data) + "bbox": to_serializable(self.bbox), + "text": to_serializable(self.text), + "conf": to_serializable(self.conf), + "embed": to_serializable(self.embed), + "vp": to_serializable(self.vpe), + "other_data": to_serializable(self.other_data), } + def from_dict(self, data: dict): - self.bbox = data.get('bbox') - self.text = data.get('text') - self.conf = data.get('conf') - self.embed = data.get('embed') - self.vpe = data.get('vpe') - self.other_data = data.get('other_data', {}) + self.bbox = data.get("bbox") + self.text = data.get("text") + self.conf = data.get("conf") + self.embed = data.get("embed") + self.vpe = data.get("vpe") + self.other_data = data.get("other_data", {}) + class Sample: def __init__(self): @@ -445,7 +458,8 @@ def load_from_grounding_label(self, grounding_data): if isinstance(grounding_data, str): assert grounding_data.endswith(".json"), "If grounding_data is str, it should be a json file path." 
import json - with open(grounding_data, 'r') as f: + + with open(grounding_data) as f: grounding_data = json.load(f) assert isinstance(grounding_data, dict), "grounding_data should be a dict" @@ -465,7 +479,9 @@ def load_from_grounding_label(self, grounding_data): self.other_data["normalized"] = normalized assert normalized is True # assert bbox_format == "xywhn" - for cls, box, segment in zip(grounding_data.get("cls", []), grounding_data.get("bboxes", []), grounding_data.get("segments", [])): + for cls, box, segment in zip( + grounding_data.get("cls", []), grounding_data.get("bboxes", []), grounding_data.get("segments", []) + ): # Convert normalized xywh to xyxy for internal consistency bbox_xyxy = YoloBox(self.shape).load_from_xywhn_normalized(np.array([box], dtype=np.float32)).xyxy[0] # Create instance with xyxy bbox @@ -506,14 +522,14 @@ def load_from_grounding_label(self, grounding_data): # return grounding_data def load_from_yoloe_result(self, yoloe_result): - if isinstance(yoloe_result, str): assert yoloe_result.endswith(".json"), "If yoloe_result is str, it should be a json file path." import json - with open(yoloe_result, 'r') as f: + + with open(yoloe_result) as f: yoloe_result = json.load(f) assert isinstance(yoloe_result, dict), "yoloe_result should be a dict" - + self.instances = [] self.im_file = yoloe_result.get("im_file") self.shape = (yoloe_result.get("orig_shape", [0, 0])[0], yoloe_result.get("orig_shape", [0, 0])[1]) @@ -535,30 +551,31 @@ def load_from_yoloe_result(self, yoloe_result): def to_dict(self): return { - 'im_file': to_serializable(self.im_file), - 'instances': [inst.to_dict() for inst in self.instances], - 'other_data': to_serializable(self.other_data) + "im_file": to_serializable(self.im_file), + "instances": [inst.to_dict() for inst in self.instances], + "other_data": to_serializable(self.other_data), } - - def save_to_json(self, json_path): import json - with open(json_path, 'w') as f: + + with open(json_path, "w") as f: json.dump(self.to_dict(), f, indent=4) # print(f"Saved sample to {json_path}") - def load_from_json(self,json_path): + def load_from_json(self, json_path): import json - with open(json_path, 'r') as f: + + with open(json_path) as f: data = json.load(f) - self.im_file = data.get('im_file') + self.im_file = data.get("im_file") self.instances = [] - for inst_data in data.get('instances', []): + for inst_data in data.get("instances", []): inst = Instance() inst.from_dict(inst_data) self.instances.append(inst) - self.other_data = data.get('other_data', {}) + self.other_data = data.get("other_data", {}) + class DataEngineAgent: def __init__(self, devices=["cuda:0"], buffer_dir="/root/ultra_louis_work/engine_buffer"): @@ -582,10 +599,6 @@ def set_classes(self, texts: list | None): model.set_classes(name_list=texts) self.texts = texts - - - - def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0.4, batch_size=3, max_workers=None): im_files = [] for file_name in os.listdir(im_dir): @@ -594,7 +607,7 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. # im_files=im_files[:128] print(f"Total images to process: {len(im_files)}") - batches = [im_files[i:i+batch_size] for i in range(0, len(im_files), batch_size)] + batches = [im_files[i : i + batch_size] for i in range(0, len(im_files), batch_size)] print(f"Total batches: {len(batches)}, batch size: {batch_size}") if not batches: return [] @@ -615,7 +628,7 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. 
assigned_batches = batches[idx::device_count] if not assigned_batches: continue - kwargs = {'conf': conf, 'iou': iou, 'texts': texts} + kwargs = {"conf": conf, "iou": iou, "texts": texts} process_args.append((device, self.buffer_dir, assigned_batches, kwargs)) if not process_args: @@ -629,13 +642,10 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. for future in tqdm(as_completed(futures), total=len(futures), desc="Model predict ..."): future.result() return results - - # print(f"Saved sample to {dst_file}") def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_image, max_workers=8): - print("Start multi-process loading of grounding data...") self.im_dir = im_dir with open(json_file) as f: @@ -659,7 +669,7 @@ def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_ imid_anns[ann["image_id"]].append(ann) self.img_path = annotations.get("img_path", "") imids = list(imid_anns.keys()) - + print(f"Total images to process: {len(imids)}") init_args = (images_data, imname_anns_data) @@ -673,25 +683,30 @@ def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_ chunk_size = max(1, min(500, len(imids) // (worker_count * 4) if worker_count > 0 else 1)) print(f"Using {worker_count} workers and chunksize: {chunk_size}") - list(tqdm(executor.map(worker_wrapper, tasks, chunksize=chunk_size), total=len(tasks), desc="Loading grounding data")) + list( + tqdm( + executor.map(worker_wrapper, tasks, chunksize=chunk_size), + total=len(tasks), + desc="Loading grounding data", + ) + ) print("Finished loading grounding data.") - def multi_process_merge_prediction(self,json_dir,predict_json_dir,max_workers=8): - - json_files= [] + def multi_process_merge_prediction(self, json_dir, predict_json_dir, max_workers=8): + json_files = [] predict_json_files = [] for sample_file_name in os.listdir(json_dir): if sample_file_name.endswith(".json"): - json_path= os.path.join(json_dir, sample_file_name) + json_path = os.path.join(json_dir, sample_file_name) json_files.append(json_path) # read json_path and get im_file name - with open(json_path, 'r') as f: + with open(json_path) as f: sample_data = json.load(f) im_file = sample_data.get("im_file") im_name = os.path.splitext(os.path.basename(im_file))[0] - predict_json_path= os.path.join(predict_json_dir, f"{im_name}.json") + predict_json_path = os.path.join(predict_json_dir, f"{im_name}.json") if os.path.exists(predict_json_path): predict_json_files.append(predict_json_path) else: @@ -738,78 +753,72 @@ def multi_process_merge_prediction(self,json_dir,predict_json_dir,max_workers=8) if total % 10000 == 0: print(f"[merge] Progress: {ok}/{total} succeeded") print(f"[merge] Done: {ok}/{total} succeeded") - def _merge_predict(self): pass - - def read_numpy_and_print(path=None): def load_dataset_cache_file(path: Path) -> dict: import gc + gc.disable() cache = np.load(str(path), allow_pickle=True).item() gc.enable() return cache + path = "/root/ultra_louis_work/engine_buffer/grounding_data/5.cache" data = load_dataset_cache_file(path) print(data) -if __name__ == "__main__": - - devices = ["cuda:0","cuda:1","cuda:2","cuda:3"] +if __name__ == "__main__": + devices = ["cuda:0", "cuda:1", "cuda:2", "cuda:3"] # agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/flickr_engine_buffer") # json_file = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json" # im_dir = "../datasets/flickr/full_images/" # 
mobileclip_text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + DATA = "mixed_grounding" # "mixed_grounding" - DATA="mixed_grounding" # "mixed_grounding" - - if DATA=="flickr": - - + if DATA == "flickr": agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/flickr_engine_buffer") json_file = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json" im_dir = "../datasets/flickr/full_images/" # mobileclip_text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - mobileclip_text_embed_pt=r"/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - + mobileclip_text_embed_pt = ( + r"/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + ) + import torch + txt_map = torch.load(mobileclip_text_embed_pt, map_location="cuda:0") - name_list = list (txt_map.keys())[:50000] + name_list = list(txt_map.keys())[:50000] # agent.multi_process_batch_model_predict(im_dir=im_dir, texts=name_list, conf=0.5, iou=0.4, batch_size=2) # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - agent.multi_process_merge_prediction(json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/grounding_data_merged", - predict_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict", - max_workers=8) - - elif DATA=="mixed_grounding": - + agent.multi_process_merge_prediction( + json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/grounding_data_merged", + predict_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict", + max_workers=8, + ) + elif DATA == "mixed_grounding": agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/mixed_engine_buffer") - json_file= "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" - im_dir="../datasets/mixed_grounding/gqa/images" + json_file = "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" + im_dir = "../datasets/mixed_grounding/gqa/images" mobileclip_text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - + # import torch # txt_map= torch.load(mobileclip_text_embed_pt, map_location="cuda:0") # name_list=list(txt_map.keys())[:50000] # agent.multi_process_batch_model_predict(im_dir=im_dir, texts=name_list, conf=0.5, iou=0.4,batch_size=2) - # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - agent.multi_process_merge_prediction(json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/grounding_data_merged", - predict_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict", - max_workers=8) + agent.multi_process_merge_prediction( + json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/grounding_data_merged", + predict_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict", + max_workers=8, + ) # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - - - - - diff --git a/data_visual.py b/data_visual.py index b8f6630..c6db357 100644 --- a/data_visual.py +++ b/data_visual.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,46 +9,35 @@ 
from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 - + im_index = 0 - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() print("length of labels:", len(de.labels)) print(de.labels[im_index]["im_file"]) # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) print("length of labels:", len(de.labels)) # de.load_yoloe() print(de.labels[im_index]["im_file"]) # de.visual_and_save2(im_index, save_path="./visualized_grounding_example1.jpg") - diff --git a/data_visual_flickr.py b/data_visual_flickr.py index 5d94a3d..b10405b 100644 --- a/data_visual_flickr.py +++ b/data_visual_flickr.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,41 +9,29 @@ from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 + im_index = 0 - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = 
"/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example1.jpg") diff --git a/data_visual_mixed.py b/data_visual_mixed.py index 788795e..3072676 100644 --- a/data_visual_mixed.py +++ b/data_visual_mixed.py @@ -1,48 +1,41 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) from data_engine import DataEngine -import numpy as np - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 - + im_index = 0 # de=DataEngine() # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) # de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() de.visual_and_save2(filename="353913.jpg", save_path="./visualized_grounding_example_v2.jpg") - - diff --git a/data_visual_object365.py b/data_visual_object365.py index d25cf2d..8a745d0 100644 --- 
a/data_visual_object365.py +++ b/data_visual_object365.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,35 +9,28 @@ from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 + im_index = 0 - - de=DataEngine(device="cuda") - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) - de.print_data_info() \ No newline at end of file + de.print_data_info() diff --git a/do_flickr.sh b/do_flickr.sh index 437196c..a9f1a80 100644 --- a/do_flickr.sh +++ b/do_flickr.sh @@ -1,17 +1,14 @@ -# activate clipenv conda env +# activate clipenv conda env source ~/miniconda3/etc/profile.d/conda.sh conda activate clipenv - # remove /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A if it exists if [ -f /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A ]; then - rm /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A + rm /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A fi - # run the refine_text.py script to generate refined labels and cache for the Flickr dataset -python3 yoloe_data_engine/refine_text.py +python3 yoloe_data_engine/refine_text.py # run data visualization script -python3 yoloe_data_engine/data_visual_flickr.py - +python3 yoloe_data_engine/data_visual_flickr.py diff --git a/do_mixed.sh b/do_mixed.sh index 898043a..f38e5c8 100644 --- a/do_mixed.sh +++ b/do_mixed.sh @@ -1,16 +1,10 @@ -# activate clipenv conda env +# activate clipenv conda env source ~/miniconda3/etc/profile.d/conda.sh conda activate clipenv - -# set gpu id to 2,3 +# set gpu id to 2,3 export CUDA_VISIBLE_DEVICES=2,3 - - - # run the refine_text.py script to generate refined labels and cache for the mixed-grounding dataset -python3 yoloe_data_engine/refine_text.py --img_path /root/ultra_louis_work/datasets/mixed_grounding/gqa/images \ ---json_file
/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json - - +python3 yoloe_data_engine/refine_text.py --img_path /root/ultra_louis_work/datasets/mixed_grounding/gqa/images \ + --json_file /root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json diff --git a/grounding_dataset_visualizer.py b/grounding_dataset_visualizer.py index adc3d11..841d85d 100644 --- a/grounding_dataset_visualizer.py +++ b/grounding_dataset_visualizer.py @@ -1,19 +1,17 @@ +import os -from ultralytics import YOLOE -from ultralytics.models.yolo.yoloe import YOLOEVPTrainer +import ultralytics -import ultralytics,os workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) +import colorsys + +import cv2 import matplotlib.pyplot as plt -import matplotlib.patches as patches import numpy as np -from PIL import Image -import cv2 -import colorsys try: from ultralytics.data import GroundingDataset @@ -23,14 +21,13 @@ class DatasetVisualizer: - def __init__(self, *args, **kwargs): # Initialize with dataset parameters super().__init__(*args, **kwargs) self.colors = self._generate_colors(100) # Generate colors for different classes - + def _generate_colors(self, num_colors): - """Generate distinct colors for visualization""" + """Generate distinct colors for visualization.""" colors = [] for i in range(num_colors): hue = i / num_colors @@ -38,77 +35,75 @@ def _generate_colors(self, num_colors): rgb = colorsys.hsv_to_rgb(hue, 0.8, 0.9) colors.append([int(c * 255) for c in rgb]) return colors - + def _convert_tensor_to_numpy(self, data): - """Convert PyTorch tensor to numpy array if needed""" - if hasattr(data, 'cpu'): # PyTorch tensor + """Convert PyTorch tensor to numpy array if needed.""" + if hasattr(data, "cpu"): # PyTorch tensor return data.cpu().numpy() elif isinstance(data, np.ndarray): return data else: return np.array(data) - + def _truncate_text(self, text, max_length=20): - """Truncate text for display if too long""" + """Truncate text for display if too long.""" if isinstance(text, str) and len(text) > max_length: - return text[:max_length-3] + "..." + return text[: max_length - 3] + "..." 
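+        # e.g. _truncate_text("a very long caption", 10) -> "a very ..."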
return str(text) - + def _draw_bbox_on_image(self, image, bbox, label, color, confidence=None): - """Draw bounding box and label on image""" + """Draw bounding box and label on image.""" if isinstance(image, np.ndarray): img = image.copy() else: img = np.array(image) - + # Convert bbox format - assuming YOLO format [center_x, center_y, width, height] center_x, center_y, width, height = bbox - + # Convert to xyxy format x1 = center_x - width / 2 y1 = center_y - height / 2 x2 = center_x + width / 2 y2 = center_y + height / 2 - + # Check if coordinates are valid if x1 >= x2 or y1 >= y2: return img - + # Check if coordinates are within image bounds h, w = img.shape[:2] - + # Convert normalized coordinates to pixel coordinates if needed if x2 <= 1.0: # Normalized coordinates x1, x2 = x1 * w, x2 * w y1, y2 = y1 * h, y2 * h - + # Ensure coordinates are within image bounds - x1 = max(0, min(w-1, x1)) - y1 = max(0, min(h-1, y1)) - x2 = max(0, min(w-1, x2)) - y2 = max(0, min(h-1, y2)) - + x1 = max(0, min(w - 1, x1)) + y1 = max(0, min(h - 1, y1)) + x2 = max(0, min(w - 1, x2)) + y2 = max(0, min(h - 1, y2)) + # Draw rectangle cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2) - + # Prepare label text (truncate if too long) label_text = self._truncate_text(label, max_length=15) if confidence is not None: label_text += f" {confidence:.2f}" - + # Draw label background (text_width, text_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1) - cv2.rectangle(img, (int(x1), int(y1) - text_height - 10), - (int(x1) + text_width, int(y1)), color, -1) - + cv2.rectangle(img, (int(x1), int(y1) - text_height - 10), (int(x1) + text_width, int(y1)), color, -1) + # Draw label text - cv2.putText(img, label_text, (int(x1), int(y1) - 5), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1) - + cv2.putText(img, label_text, (int(x1), int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1) + return img def visualize(self, idx: int): - """Visualize a single sample from the dataset""" + """Visualize a single sample from the dataset.""" # Get sample data sample = self.__getitem__(idx) print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}") @@ -119,22 +114,22 @@ def visualize(self, idx: int): # Extract image, bboxes, labels, and texts from the sample if isinstance(sample, dict): - image = sample.get('img', sample.get('image')) - bboxes = sample.get('bboxes', sample.get('bbox')) - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) - texts = sample.get('texts', sample.get('text', [])) # Extract text labels + image = sample.get("img", sample.get("image")) + bboxes = sample.get("bboxes", sample.get("bbox")) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) + texts = sample.get("texts", sample.get("text", [])) # Extract text labels else: # If sample is a tuple/list (image, target) image, target = sample if isinstance(target, dict): - bboxes = target.get('bboxes', target.get('bbox')) - labels = target.get('cls', target.get('labels', target.get('classes'))) - texts = target.get('texts', target.get('text', [])) + bboxes = target.get("bboxes", target.get("bbox")) + labels = target.get("cls", target.get("labels", target.get("classes"))) + texts = target.get("texts", target.get("text", [])) else: bboxes, labels, texts = target, None, [] - + print(f"Found {len(texts) if texts else 0} text labels: {texts[:3] if texts else 'None'}") - + # Convert tensors to numpy arrays if image is not None: image = 
self._convert_tensor_to_numpy(image) @@ -142,7 +137,7 @@ def visualize(self, idx: int): bboxes = self._convert_tensor_to_numpy(bboxes) if labels is not None: labels = self._convert_tensor_to_numpy(labels) - + # Convert image if needed if isinstance(image, np.ndarray): if len(image.shape) == 3 and image.shape[0] == 3: # CHW format @@ -153,21 +148,21 @@ def visualize(self, idx: int): # Ensure proper data type if image.dtype != np.uint8: image = image.astype(np.uint8) - + # Handle different bbox and label formats if bboxes is not None and len(bboxes) > 0: # Flatten labels if they have extra dimensions if labels is not None and len(labels.shape) > 1: labels = labels.flatten() - + print(f"texts: {texts}") print(f"labels: {labels}") - + # Draw bboxes on image viz_image = image.copy() for i, bbox in enumerate(bboxes): color = self.colors[i % len(self.colors)] - + # Use class index to get text label from texts array if labels is not None and i < len(labels) and texts: class_idx = int(labels[i]) @@ -184,11 +179,11 @@ def visualize(self, idx: int): label = f"class_{int(labels[i])}" else: label = f"obj_{i}" - + viz_image = self._draw_bbox_on_image(viz_image, bbox, label, color) else: viz_image = image - + # Display using matplotlib plt.figure(figsize=(12, 8)) if len(viz_image.shape) == 3: @@ -198,53 +193,51 @@ def visualize(self, idx: int): else: plt.imshow(viz_image) else: - plt.imshow(viz_image, cmap='gray') + plt.imshow(viz_image, cmap="gray") plt.title(f"Sample {idx}") - plt.axis('off') + plt.axis("off") plt.tight_layout() plt.show() - + return viz_image def batch_visualize(self, indices: list): - """ - Show multiple samples in matplotlib subplots - """ + """Show multiple samples in matplotlib subplots.""" n_samples = len(indices) if n_samples == 1: self.visualize(indices[0]) return - + # Calculate subplot layout cols = min(3, n_samples) rows = (n_samples + cols - 1) // cols - - fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 5*rows)) + + _fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 5 * rows)) if rows == 1: axes = [axes] if cols == 1 else axes else: axes = axes.flatten() if n_samples > 1 else [axes] - + for i, idx in enumerate(indices): # Get sample data sample = self.__getitem__(idx) - + # Extract image, bboxes, labels, and texts from the sample if isinstance(sample, dict): - image = sample.get('img', sample.get('image')) - bboxes = sample.get('bboxes', sample.get('bbox')) - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) - texts = sample.get('texts', sample.get('text', [])) # Extract text labels + image = sample.get("img", sample.get("image")) + bboxes = sample.get("bboxes", sample.get("bbox")) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) + texts = sample.get("texts", sample.get("text", [])) # Extract text labels else: # If sample is a tuple/list (image, target) image, target = sample if isinstance(target, dict): - bboxes = target.get('bboxes', target.get('bbox')) - labels = target.get('cls', target.get('labels', target.get('classes'))) - texts = target.get('texts', target.get('text', [])) + bboxes = target.get("bboxes", target.get("bbox")) + labels = target.get("cls", target.get("labels", target.get("classes"))) + texts = target.get("texts", target.get("text", [])) else: bboxes, labels, texts = target, None, [] - + # Convert tensors to numpy arrays if image is not None: image = self._convert_tensor_to_numpy(image) @@ -252,7 +245,7 @@ def batch_visualize(self, indices: list): bboxes = self._convert_tensor_to_numpy(bboxes) if 
labels is not None: labels = self._convert_tensor_to_numpy(labels) - + # Convert image if needed if isinstance(image, np.ndarray): if len(image.shape) == 3 and image.shape[0] == 3: # CHW format @@ -263,18 +256,18 @@ def batch_visualize(self, indices: list): # Ensure proper data type if image.dtype != np.uint8: image = image.astype(np.uint8) - + # Handle different bbox and label formats if bboxes is not None and len(bboxes) > 0: # Flatten labels if they have extra dimensions if labels is not None and len(labels.shape) > 1: labels = labels.flatten() - + # Draw bboxes on image viz_image = image.copy() for j, bbox in enumerate(bboxes): color = self.colors[j % len(self.colors)] - + # Use class index to get text label from texts array if labels is not None and j < len(labels) and texts: class_idx = int(labels[j]) @@ -291,11 +284,11 @@ def batch_visualize(self, indices: list): label = f"class_{int(labels[j])}" else: label = f"obj_{j}" - + viz_image = self._draw_bbox_on_image(viz_image, bbox, label, color) else: viz_image = image - + # Display in subplot ax = axes[i] if n_samples > 1 else axes[0] if len(viz_image.shape) == 3: @@ -305,139 +298,130 @@ def batch_visualize(self, indices: list): else: ax.imshow(viz_image) else: - ax.imshow(viz_image, cmap='gray') + ax.imshow(viz_image, cmap="gray") ax.set_title(f"Sample {idx}") - ax.axis('off') - + ax.axis("off") + # Hide unused subplots for i in range(n_samples, len(axes)): - axes[i].axis('off') - + axes[i].axis("off") + plt.tight_layout() plt.show() def random_visualize(self, n=5): import random + indices = random.sample(range(len(self)), n) self.batch_visualize(indices) - + def save_visualization(self, idx: int, save_path: str): - """Save visualization to file""" + """Save visualization to file.""" viz_image = self.visualize(idx) if viz_image is not None: cv2.imwrite(save_path, viz_image) print(f"Visualization saved to {save_path}") - + def get_dataset_info(self): - """Get basic information about the dataset""" + """Get basic information about the dataset.""" try: print(f"Dataset length: {len(self)}") - + # Sample first item to understand structure sample = self.__getitem__(0) print(f"Sample type: {type(sample)}") - + if isinstance(sample, dict): print(f"Sample keys: {list(sample.keys())}") - if 'img' in sample: + if "img" in sample: print(f"Image shape: {sample['img'].shape}") - if 'bboxes' in sample: + if "bboxes" in sample: print(f"Number of bboxes: {len(sample['bboxes'])}") - if 'cls' in sample: + if "cls" in sample: print(f"Classes: {sample['cls']}") else: print(f"Sample structure: {[type(x) for x in sample]}") - + except Exception as e: print(f"Error getting dataset info: {e}") - + def visualize_class_distribution(self, max_samples=1000): - """Visualize the distribution of classes in the dataset""" + """Visualize the distribution of classes in the dataset.""" class_counts = {} n_samples = min(len(self), max_samples) - + for i in range(n_samples): try: sample = self.__getitem__(i) labels = None - + if isinstance(sample, dict): - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) else: _, target = sample if isinstance(target, dict): - labels = target.get('cls', target.get('labels', target.get('classes'))) - + labels = target.get("cls", target.get("labels", target.get("classes"))) + if labels is not None: # Convert tensor to numpy if needed labels = self._convert_tensor_to_numpy(labels) # Flatten if needed if len(labels.shape) > 1: labels = 
labels.flatten() - + for label in labels: class_counts[str(int(label))] = class_counts.get(str(int(label)), 0) + 1 - + except Exception as e: print(f"Error processing sample {i}: {e}") continue - + # Plot distribution if class_counts: plt.figure(figsize=(12, 6)) classes = list(class_counts.keys()) counts = list(class_counts.values()) - + plt.bar(classes[:20], counts[:20]) # Show top 20 classes - plt.xlabel('Class') - plt.ylabel('Count') - plt.title('Class Distribution (Top 20)') + plt.xlabel("Class") + plt.ylabel("Count") + plt.title("Class Distribution (Top 20)") plt.xticks(rotation=45) plt.tight_layout() plt.show() - + print(f"Total classes found: {len(class_counts)}") print(f"Most frequent classes: {sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:10]}") else: print("No class information found in dataset") - - - class GroundingDatasetVisualizer(GroundingDataset, DatasetVisualizer): - pass - if __name__ == "__main__": - - - - img_path="../datasets/mixed_grounding/gqa/images", - json_file="../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json", + img_path = "../datasets/mixed_grounding/gqa/images" + json_file = "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" visualizer = GroundingDatasetVisualizer(json_file=json_file, img_path=img_path, augment=False) # Get dataset information print("=== Dataset Information ===") visualizer.get_dataset_info() - + # Visualize single sample to test text labels print("\n=== Single Sample Visualization ===") # visualizer.visualize(1) - + # Uncomment these for more visualizations: # print("\n=== Batch Visualization ===") - visualizer.batch_visualize([0,1,2,3,4]) - + visualizer.batch_visualize([0, 1, 2, 3, 4]) + # Random visualization print("\n=== Random Visualization ===") # visualizer.random_visualize(n=3) - + # Class distribution analysis print("\n=== Class Distribution ===") # visualizer.visualize_class_distribution(max_samples=100) - - diff --git a/log.md b/log.md index 50c2f64..3ea4ad4 100644 --- a/log.md +++ b/log.md @@ -1,8 +1,4 @@ - - - - -# flickr +# flickr ``` Load text embed from: /root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt @@ -18,10 +14,9 @@ Total number of boxes: 638214 # Object365 - ``` -(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_object365.py +(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_object365.py set workspace: /root/ultra_louis_work/ultralytics set workspace: /root/ultra_louis_work/ultralytics 608606 @@ -36,8 +31,7 @@ Total number of boxes: 15518179 # mixed - -(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_mixed.py +(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_mixed.py set workspace: /root/ultra_louis_work/ultralytics set workspace: /root/ultra_louis_work/ultralytics 46380 @@ -48,4 +42,4 @@ Total number of boxes: 2245337 46380 Load text embed from: /root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt Data style: grounding -Total number of labels: 46380 \ No newline at end of file +Total number of labels: 46380
diff --git a/readme.md b/readme.md index 0802e8c..b1cd38e 100644 --- a/readme.md +++ b/readme.md @@ -1,33 +1,27 @@ +# pipeline of the data engine +### read the grounding data from the JSON file - -# pipline of the data engine -### read the grounding data from the JSON file for each sample, pre-store the other samples sharing the same image. - add the - -### model predict and save the JSON files -- visual to check the json files -- found that some boxes are overlapped heavily, with different text -- how to deal with these boxes? - -### merge model prediction to label, -- discard the bbox with higher iou ( > 0.8, higher iou, no consider the class or text) + add the +### run model prediction and save the JSON files +- visualize to check the JSON files +- found that some boxes overlap heavily, with different text +- how to deal with these boxes? +### merge model prediction to label, +- discard the bbox with IoU > 0.8 against an existing label box (class and text are not considered; see the sketch below) -- generate the visual prompt embedding for each instance (bbox) - - -- merge bboxes within the same image ( consider the vpe distance and text similarity,bbox iou<0.8 ) - +- generate the visual prompt embedding for each instance (bbox) -- transfer to grounding format cache for training +- merge bboxes within the same image (consider the VPE distance and text similarity, bbox IoU < 0.8) -- +- transfer to grounding format cache for training +- to do:
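The IoU-discard rule above reduces to a short, class-agnostic filter. A minimal sketch (illustrative only: `iou_matrix` and `merge_predictions` are hypothetical helper names, boxes are assumed to be float xyxy arrays; only the 0.8 threshold and the class/text-agnostic comparison come from the readme):

```python
import numpy as np


def iou_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Pairwise IoU between two sets of xyxy boxes, shapes (N, 4) and (M, 4)."""
    tl = np.maximum(a[:, None, :2], b[None, :, :2])  # intersection top-left corners
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])  # intersection bottom-right corners
    inter = np.prod(np.clip(br - tl, 0.0, None), axis=2)  # zero area when boxes do not overlap
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / np.maximum(union, 1e-9)  # guard against division by zero for degenerate boxes


def merge_predictions(label_boxes: np.ndarray, pred_boxes: np.ndarray, thr: float = 0.8) -> np.ndarray:
    """Append predicted boxes unless they overlap an existing label box with IoU > thr."""
    if len(label_boxes) == 0 or len(pred_boxes) == 0:
        return np.concatenate([label_boxes, pred_boxes], axis=0)
    keep = iou_matrix(pred_boxes, label_boxes).max(axis=1) <= thr  # class/text deliberately ignored
    return np.concatenate([label_boxes, pred_boxes[keep]], axis=0)
```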
""" - - - yoloe_model= self.model - predictor=yolo.yoloe.YOLOEVPDetectPredictor + yoloe_model = self.model + predictor = yolo.yoloe.YOLOEVPDetectPredictor if type(yoloe_model.predictor) is not predictor: yoloe_model.predictor = predictor( overrides={ @@ -46,12 +50,10 @@ def vpe_text(self, source, visual_prompts ,texts): _callbacks=yoloe_model.callbacks, ) # self.task = "segment" if isinstance(self.predictor, yolo.segment.SegmentationPredictor) else "detect" - - # get the vpe from current image and visual prompts - prompts={"bboxes": visual_prompts["bboxes"], - "cls":list( range( len(visual_prompts["cls"])))} - num_cls= len(set(prompts["cls"])) + # get the vpe from current image and visual prompts + prompts = {"bboxes": visual_prompts["bboxes"], "cls": list(range(len(visual_prompts["cls"])))} + num_cls = len(set(prompts["cls"])) yoloe_model.model.model[-1].nc = num_cls yoloe_model.model.model[-1].no = num_cls + yoloe_model.model.model[-1].reg_max * 4 yoloe_model.model.names = [f"object{i}" for i in range(num_cls)] @@ -59,24 +61,19 @@ def vpe_text(self, source, visual_prompts ,texts): yoloe_model.predictor.setup_model(model=yoloe_model.model) vpe = yoloe_model.predictor.get_vpe(source).squeeze(0) - tpe= yoloe_model.get_text_pe(texts).squeeze(0) + tpe = yoloe_model.get_text_pe(texts).squeeze(0) # normalize - vpe= torch.nn.functional.normalize(vpe,dim=-1,p=2) - tpe= torch.nn.functional.normalize(tpe,dim=-1,p=2) + vpe = torch.nn.functional.normalize(vpe, dim=-1, p=2) + tpe = torch.nn.functional.normalize(tpe, dim=-1, p=2) # cal the similarity and return the text for each box similarities = (vpe @ tpe.T).softmax(dim=-1) # (N, M) matched_indices = similarities.argmax(dim=-1) # (N,) matched_texts = [texts[i] for i in matched_indices.tolist()] return matched_texts - - - - def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: - """ - Load annotations from a JSON file, filter, and normalize bounding boxes for each image. + """Load annotations from a JSON file, filter, and normalize bounding boxes for each image. Args: path (Path): Path where to save the cache file. 
@@ -89,7 +86,6 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: with open(self.json_file) as f: annotations = json.load(f) - # images = {f"{im['id']:d}": im for im in annotations["images"]} # Map image IDs to file names @@ -101,9 +97,6 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: imid = ann["image_id"] imname = imid_imname[f"{imid:d}"] imname_anns[imname].append(ann) - - - # # map sample id to the annotations # img_ids= [im["id"] for im in annotations["images"]] @@ -116,13 +109,10 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: for ann in annotations["annotations"]: imid_anns[ann["image_id"]].append(ann) - - if not hasattr(self, 'model') or self.model is None: + if not hasattr(self, "model") or self.model is None: self.load_yoloe() - for img_id, anns in TQDM(imid_anns.items(), desc=f"Reading annotations {self.json_file}"): - # if img_id > 16*10: break # for testing img = images[f"{img_id:d}"] h, w, f = img["height"], img["width"], img["file_name"] @@ -130,18 +120,19 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: if not im_file.exists(): continue self.im_files.append(str(im_file)) - bboxes_xyxy=[] + bboxes_xyxy = [] bboxes = [] segments = [] cat2id = {} texts = [] - - anns_for_img=imname_anns[f] + anns_for_img = imname_anns[f] for ann in anns + anns_for_img: - - if len(bboxes_xyxy) > 0 and YoloBox([int(h),int(w)]).load_from_xyxy(bboxes_xyxy).iou(ann["bbox"]).max()>0.98: + if ( + len(bboxes_xyxy) > 0 + and YoloBox([int(h), int(w)]).load_from_xyxy(bboxes_xyxy).iou(ann["bbox"]).max() > 0.98 + ): # print("skip duplicate box") continue if ann["iscrowd"]: @@ -162,7 +153,7 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: cat2id[cat_name] = len(cat2id) texts.append([cat_name]) cls = cat2id[cat_name] # class - box = [cls] + box.tolist() + box = [cls, *box.tolist()] if box not in bboxes: bboxes.append(box) if ann.get("segmentation") is not None: @@ -179,10 +170,9 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: .reshape(-1) .tolist() ) - s = [cls] + s + s = [cls, *s] segments.append(s) - bboxes_xyxy.append(ann["bbox"]) # add xyxy box for iou calculation - + bboxes_xyxy.append(ann["bbox"]) # add xyxy box for iou calculation lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32) @@ -192,32 +182,29 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh) lb = np.array(lb, dtype=np.float32) - label= { - "im_file": im_file, - "shape": (h, w), - "cls": lb[:, 0:1], # n, 1 - "bboxes": lb[:, 1:], # n, 4 - "segments": segments, - "normalized": True, - "bbox_format": "xywh", - "texts": texts, - } + label = { + "im_file": im_file, + "shape": (h, w), + "cls": lb[:, 0:1], # n, 1 + "bboxes": lb[:, 1:], # n, 4 + "segments": segments, + "normalized": True, + "bbox_format": "xywh", + "texts": texts, + } # - x["labels"].append(label) + ####### append boxes - - ####### append boxes - - batch_size=64 + batch_size = 64 - self.data_style="grounding" - for start in tqdm(range(0,len(x["labels"]),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(x["labels"])))) - batch_texts=[] + self.data_style = "grounding" + for start in tqdm(range(0, len(x["labels"]), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(x["labels"])))) + 
batch_texts = [] for indice in batch_indices: label_texts = x["labels"][indice].get("texts", []) if isinstance(label_texts, list): @@ -233,66 +220,58 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: else: self.set_classes(name_list=None) - results=self.yoloe_predict_batch([ x["labels"][i] for i in batch_indices ], conf=0.1,iou=0.4) - assert len(results)==len(batch_indices), "Mismatch between results and batch_indices length" - for indice,res in zip(batch_indices,results): - iou=0.1 # append new boxes when iou < 0.1 - replace=False # do not replace existing boxes - x["labels"][indice]= self._update_grounding_label(x["labels"][indice],res,iou=iou,replace=replace) - - + results = self.yoloe_predict_batch([x["labels"][i] for i in batch_indices], conf=0.1, iou=0.4) + assert len(results) == len(batch_indices), "Mismatch between results and batch_indices length" + for indice, res in zip(batch_indices, results): + iou = 0.1 # append new boxes when iou < 0.1 + replace = False # do not replace existing boxes + x["labels"][indice] = self._update_grounding_label(x["labels"][indice], res, iou=iou, replace=replace) self.load_yoloe() # reload to reset class number ##### refine the bbox texts imname_image = {im["file_name"]: im for im in annotations["images"]} - for indice,label in tqdm(enumerate(x["labels"]), desc="Refining texts for grounding data"): - bboxes_xyxy= YoloBox((int(label["shape"][0]),int(label["shape"][1]))).load_from_xywhn_normalized(label["bboxes"]).xyxy - visual={"bboxes": bboxes_xyxy, - "cls": list(range(bboxes_xyxy.shape[0]))} - texts= [] + for indice, label in tqdm(enumerate(x["labels"]), desc="Refining texts for grounding data"): + bboxes_xyxy = ( + YoloBox((int(label["shape"][0]), int(label["shape"][1]))) + .load_from_xywhn_normalized(label["bboxes"]) + .xyxy + ) + visual = {"bboxes": bboxes_xyxy, "cls": list(range(bboxes_xyxy.shape[0]))} + texts = [] for text_list in label["texts"]: texts.extend(text_list) - print("original texts for image ", ":", texts) - caption= imname_image[label["im_file"].name]["caption"].replace(".","") - caption_texts= caption.split() + print("original texts for image ", ":", texts) + caption = imname_image[label["im_file"].name]["caption"].replace(".", "") + caption_texts = caption.split() texts.extend(caption_texts) - print("caption_texts for image ", ":", caption_texts) - texts= list(set(texts)) - matched_texts= self.vpe_text(source= label["im_file"], visual_prompts= visual, texts= texts) - matches_texts_set= list(set(matched_texts)) - label['texts']= [[text] for text in matches_texts_set] + print("caption_texts for image ", ":", caption_texts) + texts = list(set(texts)) + matched_texts = self.vpe_text(source=label["im_file"], visual_prompts=visual, texts=texts) + matches_texts_set = list(set(matched_texts)) + label["texts"] = [[text] for text in matches_texts_set] # take cls as the index in the matched texts set - label["cls"]= [ matches_texts_set.index(text) for text in matched_texts ] - + label["cls"] = [matches_texts_set.index(text) for text in matched_texts] - - print(label['cls']) + print(label["cls"]) print(matched_texts) - print(label['texts']) - x["labels"][indice]= label - - + print(label["texts"]) + x["labels"][indice] = label x["hash"] = get_hash(self.json_file) - save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION) return x - - - - # """ # refine the text for grounding data by running grounding prediction and updating the texts. # how to set classes: # 1. 
collect all texts in the current batch. -# 2. do such refinement for each image in the batch: +# 2. do such refinement for each image in the batch: -# # read from the json file as +# # read from the json file as # ["two people"] -> ["two", "people","two people"] # ["what"]. -> update according to the grounding prediction results. @@ -301,12 +280,10 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # """ - - # def load_src_json(self, json_path): # """ -# read the original json file for grounding dataset. +# read the original json file for grounding dataset. # """ # import json @@ -333,18 +310,18 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # print("number of annotations:", len(data["annotations"])) # print("-"*40) # print("example image entry:", data["images"][0]) -# print("exmple category entry:") +# print("example category entry:") # for i in range(4): # print("-"*40) -# print( data["annotations"][i]) +# print( data["annotations"][i]) -# return +# return # json_data = { # "file_names": [], # "images": {}, -# "annotations": {} +# "annotations": {} # } # for x in data["images"]: # file_name = x["file_name"] @@ -355,7 +332,7 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # print # self.json_data = json_data - + # def get_captions_texts(self,file_name): # """ # get all captions and split them into texts @@ -373,15 +350,14 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # return captions, caption_texts - # de=DataEngine() # load_src_json(de, "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json") # cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" -# # de.load_cached_label(cache_path=cache_path, -# # data_style="grounding", +# # de.load_cached_label(cache_path=cache_path, +# # data_style="grounding", # # text_embed_pt=text_embed_pt) # # de.load_yoloe() @@ -392,32 +368,22 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # # def predict_and_update_text(self,indice): +DATA_DIR = "../datasets/" -from ultralytics import YOLOE -from ultralytics.models.yolo.yoloe import YOLOEVPTrainer - - - - - -DATA_DIR="../datasets/" - -Objects365v1="../datasets/Objects365v1.yaml" +Objects365v1 = "../datasets/Objects365v1.yaml" import argparse + parser = argparse.ArgumentParser() -parser.add_argument('--img_path', type=str, default=DATA_DIR+"flickr/full_images/") -parser.add_argument('--json_file', type=str, default=DATA_DIR+"flickr/annotations/final_flickr_separateGT_train_segm.json") +parser.add_argument("--img_path", type=str, default=DATA_DIR + "flickr/full_images/") +parser.add_argument( + "--json_file", type=str, default=DATA_DIR + "flickr/annotations/final_flickr_separateGT_train_segm.json" +) args = parser.parse_args() - -data= RefineGroundingDataset( - img_path=args.img_path, - json_file=args.json_file, - ) - - - - +data = RefineGroundingDataset( + img_path=args.img_path, + json_file=args.json_file, +) diff --git a/remove_segment.py b/remove_segment.py index 1f554d5..a6a3ac7 100644 --- a/remove_segment.py +++ b/remove_segment.py @@ -1,28 +1,26 @@ from data_engine import DataEngine -if __name__=="__main__": - - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - 
cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" - de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) + +if __name__ == "__main__": + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" + de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) diff --git a/utils.py b/utils.py index 93f2543..9e4c71a 100644 --- a/utils.py +++ b/utils.py @@ -1,11 +1,8 @@ -import os from pathlib import Path def get_img_num(folder): - """ - calculate the number of images in a folder - """ + """Calculate the number of images in a folder.""" img_suffix = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"] img_num = 0 for suffix in img_suffix: @@ -13,33 +10,25 @@ def get_img_num(folder): return img_num - def get_json_num(folder): - """ - calculate the number of json files in a folder - """ + """Calculate the number of json files in a folder.""" json_num = len(list(Path(folder).rglob("*.json"))) return json_num - -flickr_res_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict" +flickr_res_json_dir = "/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict" print("number of json files:", get_json_num(flickr_res_json_dir)) -flickr_img_dir="/root/ultra_louis_work/datasets/flickr/full_images" +flickr_img_dir = "/root/ultra_louis_work/datasets/flickr/full_images" print("number of flickr images:", get_img_num(flickr_img_dir)) - -mixed_img_dir="/root/ultra_louis_work/datasets/mixed_grounding/gqa/images" +mixed_img_dir = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/images" print("number of images:", get_img_num(mixed_img_dir)) - -mixed_res_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict" +mixed_res_json_dir = 
"/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict" print("number of json files:", get_json_num(mixed_res_json_dir)) - - diff --git a/visual_json.py b/visual_json.py index 5d86e8e..790440c 100644 --- a/visual_json.py +++ b/visual_json.py @@ -1,15 +1,16 @@ +from __future__ import annotations + import json import os from pathlib import Path -from typing import Any, Dict, List +from typing import Any import numpy as np import torch -from PIL import Image import ultralytics +from PIL import Image from ultralytics.engine.results import Results - workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -18,9 +19,8 @@ from data_engine_agent import Instance, Sample # noqa: E402 pylint: disable=C0413 -def _ensure_sequence(value: Any) -> List[Any]: +def _ensure_sequence(value: Any) -> list[Any]: """Normalize single values to a list while leaving iterables intact.""" - if value is None: return [] if isinstance(value, (list, tuple)): @@ -30,13 +30,12 @@ def _ensure_sequence(value: Any) -> List[Any]: def load_from_json(json_path: Path | str) -> Sample: """Load a `Sample` from a JSON file generated by the data engine.""" - json_path = Path(json_path) if not json_path.is_file(): raise FileNotFoundError(f"Sample JSON not found: {json_path}") with json_path.open("r", encoding="utf-8") as handle: - payload: Dict[str, Any] = json.load(handle) + payload: dict[str, Any] = json.load(handle) sample = Sample() sample.im_file = payload.get("im_file") @@ -78,7 +77,6 @@ def load_from_json(json_path: Path | str) -> Sample: def _resolve_image_path(sample: Sample, image_root: Path | str | None = None) -> Path: """Resolve the image path for a sample, considering optional root hints.""" - if sample.im_file is None: raise ValueError("Sample does not specify an image file") @@ -103,25 +101,23 @@ def _resolve_image_path(sample: Sample, image_root: Path | str | None = None) -> raise FileNotFoundError(f"Unable to locate image file for sample: {sample.im_file}") -def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> List[Results]: +def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> list[Results]: """Convert a `Sample` into a list containing a single Ultralytics `Results` object.""" - img_path = _resolve_image_path(sample, image_root=image_root) orig_img = np.array(Image.open(img_path).convert("RGB")) - text_instances={} + text_instances = {} for inst in sample.instances: if inst.text[0] not in text_instances.keys(): - text_instances[inst.text[0]]=[] + text_instances[inst.text[0]] = [] text_instances[inst.text[0]].append(inst) text_result = {} for text, instances in text_instances.items(): - - boxes_data: List[List[float]] = [] - names: List[str] = [] - name_to_idx: Dict[str, int] = {} + boxes_data: list[list[float]] = [] + names: list[str] = [] + name_to_idx: dict[str, int] = {} for inst in instances: bbox_array = np.array(inst.bbox, dtype=np.float32).reshape(-1, 4) @@ -137,16 +133,22 @@ def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> L conf_value = float(inst.conf[0]) if inst.conf else 0.0 for bbox in bbox_array: - boxes_data.append([ - float(bbox[0]), - float(bbox[1]), - float(bbox[2]), - float(bbox[3]), - conf_value, - cls_idx, - ]) - - boxes_tensor = torch.from_numpy(np.array(boxes_data, dtype=np.float32)) if boxes_data else torch.zeros((0, 6), dtype=torch.float32) + boxes_data.append( + [ + float(bbox[0]), + float(bbox[1]), + float(bbox[2]), + 
float(bbox[3]), + conf_value, + cls_idx, + ] + ) + + boxes_tensor = ( + torch.from_numpy(np.array(boxes_data, dtype=np.float32)) + if boxes_data + else torch.zeros((0, 6), dtype=torch.float32) + ) names_dict = {idx: name for idx, name in enumerate(names)} result = Results( @@ -155,12 +157,12 @@ def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> L names=names_dict, boxes=boxes_tensor, ) - text_result[text]=result + text_result[text] = result return text_result + def visualize_sample(sample: Sample, dst_vis_img: Path | str, image_root: Path | str | None = None) -> Path: """Render sample predictions to an image and save it to ``dst_vis_img``.""" - text_result = sample_to_results(sample, image_root=image_root) dst_vis_path = Path(dst_vis_img) dst_vis_path.parent.mkdir(parents=True, exist_ok=True) @@ -183,4 +185,4 @@ def visualize_sample(sample: Sample, dst_vis_img: Path | str, image_root: Path | print(f"Loaded {len(sample.instances)} instances from {sample.im_file}") output_path = Path("../visual_json/visual_img.jpg") saved_path = visualize_sample(sample, output_path) - print(f"Saved visualization to {saved_path}") \ No newline at end of file + print(f"Saved visualization to {saved_path}")
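If a consumer only needs the grouped boxes rather than a rendered image, `sample_to_results` can be called directly. A minimal sketch (the JSON path is a placeholder; `load_from_json` and `sample_to_results` are the helpers defined in visual_json.py above):

```python
from visual_json import load_from_json, sample_to_results

sample = load_from_json("path/to/sample.json")  # placeholder path to an engine-generated JSON
results_by_text = sample_to_results(sample)  # dict: text label -> Ultralytics Results
for text, res in results_by_text.items():
    print(text, res.boxes.xyxy.shape)  # xyxy boxes grouped under this text label
```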