From 254707b53be28d6fcdaf6acfb8ac270ac06a77d4 Mon Sep 17 00:00:00 2001
From: Tejaswini Jayashanker
Date: Tue, 9 Dec 2025 09:03:32 +0530
Subject: [PATCH 1/2] Updated readme.md

Signed-off-by: Tejaswini Jayashanker
---
 readme.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/readme.md b/readme.md
index 75a58bd..0802e8c 100644
--- a/readme.md
+++ b/readme.md
@@ -2,17 +2,17 @@
 # pipeline of the data engine

-### read the grounding data from json file
+### read the grounding data from the JSON file

 for each sample, pre-store the other samples sharing the same image. add the

-### model predict and save the jons files
+### model predict and save the JSON files

 - visually inspect the JSON files
 - found that some boxes overlap heavily but carry different text
 - how to deal with these boxes?

 ### merge model predictions into the labels

-- discard the bbox with higher iou ( > 0.8, higher iou , no consider the class or text)
+- discard boxes with high IoU (> 0.8), without considering class or text (see the IoU sketch after this patch)

@@ -21,7 +21,7 @@

 - generate the visual prompt embedding for each instance (bbox)

-- merge bboxes within the same image ( consider the vpe distance and text similarity ,bbox iou<0.8 )
+- merge bboxes within the same image (considering the VPE distance and text similarity; bbox IoU < 0.8)

 - convert to the grounding-format cache for training

@@ -31,4 +31,4 @@

 to do:

-write a tools to visual the bbox ious. within the same images
\ No newline at end of file
+Write a tool to visualize the BBox IoUs within the same image.
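
The merge rule above reduces to a plain IoU test between each model prediction and the existing label boxes. A minimal sketch of that check, assuming xyxy boxes and the 0.8 threshold from the notes above (the function names are illustrative, not part of this repo):

```python
import numpy as np


def iou_xyxy(a, b) -> float:
    """IoU of two boxes given as [x0, y0, x1, y1]."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0


def keep_prediction(pred_box, label_boxes, thr=0.8) -> bool:
    # Drop a predicted box once it overlaps any label box with IoU > thr,
    # regardless of class or text, per the merge rule in the readme.
    return all(iou_xyxy(pred_box, lb) <= thr for lb in label_boxes)
```

From 3e6afe343285a6d4377dbddaa842a8b34271d854 Mon Sep 17 00:00:00 2001
From: UltralyticsAssistant
Date: Tue, 9 Dec 2025 03:34:20 +0000
Subject: [PATCH 2/2] Auto-format by https://ultralytics.com/actions

---
 data_engine.py                  | 549 +++++++++++++++-----------------
 data_engine_agent.py            | 247 +++++++-------
 data_visual.py                  |  42 +--
 data_visual_flickr.py           |  39 +--
 data_visual_mixed.py            |  37 +--
 data_visual_object365.py        |  34 +-
 do_flickr.sh                    |  11 +-
 do_mixed.sh                     |  14 +-
 grounding_dataset_visualizer.py | 236 +++++++-------
 log.md                          |  14 +-
 readme.md                       |  32 +-
 refine_text.py                  | 240 ++++++--------
 remove_segment.py               |  34 +-
 utils.py                        |  23 +-
 visual_json.py                  |  60 ++--
 15 files changed, 742 insertions(+), 870 deletions(-)

diff --git a/data_engine.py b/data_engine.py
index b2bd18d..1b37420 100644
--- a/data_engine.py
+++ b/data_engine.py
@@ -1,39 +1,43 @@
-import ultralytics,os
+import os
+
+import ultralytics
+
 workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__)))
 os.chdir(workspace)
 print("set workspace:", workspace)

-import numpy as np
-from pathlib import Path
 import os
-from PIL import Image, ImageDraw, ImageFont
+from pathlib import Path

+import numpy as np
+from PIL import Image
 from ultralytics.data.utils import load_dataset_cache_file
 from ultralytics.engine.results import Results

 os.chdir(os.path.dirname(os.path.abspath(__file__)))


 def get_names_from_yaml_config(yaml_config):
     import yaml
+
     if not os.path.exists(yaml_config):
         raise FileNotFoundError(f"YAML config file not found: {yaml_config}")
-    with open(yaml_config, 'r') as f:
+    with open(yaml_config) as f:
         data_dict = yaml.safe_load(f)
-    names = data_dict['names']
+    names = data_dict["names"]
     return names


-class YoloBox(object):
-
-    def __init__(self,img_shape:list):
-        assert len(img_shape)==2, "img_sz should be (height,width)"
-        self.img_h=img_shape[0]
-        self.img_w=img_shape[1]
-        self.xyxy=None
-        self.xywhn=None # normalized xywh
+class YoloBox:
+    def __init__(self, img_shape: list):
+        assert 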
len(img_shape) == 2, "img_sz should be (height,width)" + self.img_h = img_shape[0] + self.img_w = img_shape[1] + self.xyxy = None + self.xywhn = None # normalized xywh - def load_from_xywhn_normalized(self,bboxes_xywhn): + def load_from_xywhn_normalized(self, bboxes_xywhn): # xywhn: [N,4] x_center,y_center,w,h (normalized) bboxes_xyxy = np.zeros_like(bboxes_xywhn) if bboxes_xywhn.shape[0] > 0: @@ -42,14 +46,15 @@ def load_from_xywhn_normalized(self,bboxes_xywhn): bboxes_xyxy[:, 2] = (bboxes_xywhn[:, 0] + bboxes_xywhn[:, 2] / 2) * self.img_w bboxes_xyxy[:, 3] = (bboxes_xywhn[:, 1] + bboxes_xywhn[:, 3] / 2) * self.img_h - self.xyxy=bboxes_xyxy + self.xyxy = bboxes_xyxy - self.xywhn=bboxes_xywhn + self.xywhn = bboxes_xywhn return self - def load_from_xyxy(self,bboxes_xyxy): - if isinstance(bboxes_xyxy,list): - bboxes_xyxy=np.array(bboxes_xyxy) - + + def load_from_xyxy(self, bboxes_xyxy): + if isinstance(bboxes_xyxy, list): + bboxes_xyxy = np.array(bboxes_xyxy) + # Ensure the array is of a numeric type bboxes_xyxy = np.array(bboxes_xyxy, dtype=np.float32) @@ -68,367 +73,350 @@ def load_from_xyxy(self,bboxes_xyxy): bboxes_xywhn[:, 2] = (bboxes_xyxy[:, 2] - bboxes_xyxy[:, 0]) / self.img_w bboxes_xywhn[:, 3] = (bboxes_xyxy[:, 3] - bboxes_xyxy[:, 1]) / self.img_h - self.xyxy=bboxes_xyxy + self.xyxy = bboxes_xyxy - self.xywhn=bboxes_xywhn + self.xywhn = bboxes_xywhn return self - - def iou(self,bbox_xyxy): + + def iou(self, bbox_xyxy): # bbox_xyxy: [4,] x0,y0,x1,y1 assert self.xyxy is not None, "self.xyxy is None, please load the box first" - ious=[] + ious = [] for i in range(self.xyxy.shape[0]): - box=self.xyxy[i] - xi1=max(box[0],bbox_xyxy[0]) - yi1=max(box[1],bbox_xyxy[1]) - xi2=min(box[2],bbox_xyxy[2]) - yi2=min(box[3],bbox_xyxy[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box[2]-box[0])*(box[3]-box[1]) - box2_area=(bbox_xyxy[2]-bbox_xyxy[0])*(bbox_xyxy[3]-bbox_xyxy[1]) - union_area=box1_area+box2_area-inter_area - iou=inter_area/union_area if union_area>0 else 0 + box = self.xyxy[i] + xi1 = max(box[0], bbox_xyxy[0]) + yi1 = max(box[1], bbox_xyxy[1]) + xi2 = min(box[2], bbox_xyxy[2]) + yi2 = min(box[3], bbox_xyxy[3]) + inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1) + box1_area = (box[2] - box[0]) * (box[3] - box[1]) + box2_area = (bbox_xyxy[2] - bbox_xyxy[0]) * (bbox_xyxy[3] - bbox_xyxy[1]) + union_area = box1_area + box2_area - inter_area + iou = inter_area / union_area if union_area > 0 else 0 ious.append(iou) return np.array(ious) import torch + class DataEngine: - - def __init__(self,device="cuda"): - self.device=device + def __init__(self, device="cuda"): + self.device = device def load_yoloe(self): from ultralytics import YOLOE - model_path="/root/ultra_louis_work/ultralytics/yoloe-11l-seg.pt" - yaml_file="yoloe-11l-seg.yaml" - if hasattr(self,'model'): + model_path = "/root/ultra_louis_work/ultralytics/yoloe-11l-seg.pt" + yaml_file = "yoloe-11l-seg.yaml" + if hasattr(self, "model"): # clear the existing model del self.model torch.cuda.empty_cache() - - self.model=YOLOE(yaml_file).load(model_path).to(self.device) + self.model = YOLOE(yaml_file).load(model_path).to(self.device) print("load model from:", model_path) - def set_classes(self,yaml_config=None,name_list=None, text_embed_pt=None): - + def set_classes(self, yaml_config=None, name_list=None, text_embed_pt=None): # only one of yaml_config and name_list should be provided assert (yaml_config is None) or (name_list is None), "Only one of yaml_config and name_list should be provided" if yaml_config is not None: - 
assert name_list is None, "If yaml_config is provided, name_list should be None" name_list = get_names_from_yaml_config(yaml_config) - name_list=list(name_list.values()) + name_list = list(name_list.values()) print("Load names from yaml:", yaml_config) if text_embed_pt is not None: assert os.path.exists(text_embed_pt), f"Text embed pt file not found: {text_embed_pt}" - txt_map= torch.load(text_embed_pt, map_location=self.device) - name_list=list(txt_map.keys()) + txt_map = torch.load(text_embed_pt, map_location=self.device) + name_list = list(txt_map.keys()) print("Load text embed from:", text_embed_pt) + assert name_list is None or isinstance(name_list, list), "name_list should be a list of strings or None" - assert name_list is None or isinstance(name_list,list), "name_list should be a list of strings or None" - - if name_list is not None : + if name_list is not None: print(f"Set {len(name_list)} classes") self.model.set_classes(name_list, self.model.get_text_pe(name_list)) - self.names=name_list + self.names = name_list else: print("No classes set") - - - def yoloe_predict(self,indice,conf=0.05,save_path=None): - img_file=self.labels[indice]['im_file'] - if hasattr(self,'img_source'): - img_file=os.path.join(self.img_source,img_file) - result=self.model.predict(img_file,conf=conf) + def yoloe_predict(self, indice, conf=0.05, save_path=None): + img_file = self.labels[indice]["im_file"] + if hasattr(self, "img_source"): + img_file = os.path.join(self.img_source, img_file) + result = self.model.predict(img_file, conf=conf) if save_path is not None: result[0].save(save_path) print("save to:", save_path) return result - def yoloe_predict_batch(self, labels, conf=0.05,iou=0.4): - img_files=[] + def yoloe_predict_batch(self, labels, conf=0.05, iou=0.4): + img_files = [] for label in labels: - img_file=label['im_file'] - if hasattr(self,'img_source'): - img_file=os.path.join(self.img_source,img_file) + img_file = label["im_file"] + if hasattr(self, "img_source"): + img_file = os.path.join(self.img_source, img_file) img_files.append(img_file) if not img_files: return [] - - return list(self.model.predict(img_files, conf=conf,iou=iou, batch=len(img_files),stream=True)) + return list(self.model.predict(img_files, conf=conf, iou=iou, batch=len(img_files), stream=True)) def __len__(self): return len(self.labels) + def set_img_folder(self, img_source): + self.img_source = img_source + def load_cached_label(self, cache_path, data_style="grounding", yaml_config=None, text_embed_pt=None): + self.cache_path = cache_path - - def set_img_folder(self,img_source): - self.img_source=img_source - - def load_cached_label(self,cache_path, data_style="grounding",yaml_config=None, text_embed_pt=None ): - - self.cache_path=cache_path - - cache=load_dataset_cache_file(Path(cache_path)) - self.cache=cache - self.labels=cache["labels"] + cache = load_dataset_cache_file(Path(cache_path)) + self.cache = cache + self.labels = cache["labels"] print(len(self.labels)) - assert data_style in ["grounding","detection"] - self.data_style=data_style - - if data_style=="detection": + assert data_style in ["grounding", "detection"] + self.data_style = data_style + + if data_style == "detection": assert yaml_config is not None, "yaml_config must be provided for detection data_style" if not os.path.exists(yaml_config): raise FileNotFoundError(f"YAML config file not found: {yaml_config}") - self.yaml_config=yaml_config + self.yaml_config = yaml_config - # read names from the yaml file + # read names from the yaml file import yaml - 
with open(yaml_config, 'r') as f: + + with open(yaml_config) as f: data_dict = yaml.safe_load(f) - self.names = data_dict['names'] + self.names = data_dict["names"] - elif data_style=="grounding": + elif data_style == "grounding": assert text_embed_pt is not None, "text_embed_pt must be provided for grounding data_style" - self.text_embed_pt=text_embed_pt + self.text_embed_pt = text_embed_pt if not os.path.exists(text_embed_pt): raise FileNotFoundError(f"Text embed pt file not found: {text_embed_pt}") else: print("Load text embed from:", text_embed_pt) txt_map = torch.load(text_embed_pt, map_location=self.device, weights_only=False) - self.names=list(txt_map.keys()) - + self.names = list(txt_map.keys()) def print_data_info(self): - """ - Print information about the dataset labels: - - data_style - - Total number of labels - - Total number of boxes (for detection and grounding) + """Print information about the dataset labels: - data_style - Total number of labels - Total number of boxes + (for detection and grounding). """ print(f"Data style: {self.data_style}") - print("Keys: {}".format(self.labels[0].keys() if len(self.labels)>0 else "No labels") ) + print("Keys: {}".format(self.labels[0].keys() if len(self.labels) > 0 else "No labels")) print(f"Total number of labels: {len(self.labels)}") if self.data_style in ["detection", "grounding"]: total_boxes = sum(len(label.get("bboxes", [])) for label in self.labels) print(f"Total number of boxes: {total_boxes}") - def remove_masks_and_segments(self): for label in tqdm(self.labels): - label["segments"]=[] + label["segments"] = [] - def save_cached_label(self,save_path=None): + def save_cached_label(self, save_path=None): if save_path is None: - save_path=self.cache_path + save_path = self.cache_path from copy import deepcopy - copy_cache=deepcopy(self.cache) - copy_cache["labels"]=self.labels + + copy_cache = deepcopy(self.cache) + copy_cache["labels"] = self.labels with open(save_path, "wb") as f: np.save(f, copy_cache) - def print_one_label(self,indice): - - label=self.labels[indice] + def print_one_label(self, indice): + label = self.labels[indice] # print(self.labels[indice]) print(label.keys()) - print(label['im_file']) - for key,val in label.items(): + print(label["im_file"]) + for key, val in label.items(): print(f"{key}: {type(val)}") - if isinstance(val,list): + if isinstance(val, list): print(f" Length: {len(val)}") # if len(val)>0: # print(f" First 3 elements: {val[:3]}") - elif isinstance(val,np.ndarray): + elif isinstance(val, np.ndarray): print(f" Shape: {val.shape}") print(f" Dtype: {val.dtype}") print(f" First 5 elements: {val.flatten()[:5]}") - - elif isinstance(val,dict): + + elif isinstance(val, dict): print(f" Dict with keys: {list(val.keys())}") else: print(f" Value: {val}") - - def detection_predict_and_update_labels(self,indice,iou=0.3,replace=True,conf=0.1): - result=self.yoloe_predict(indice=indice,conf=conf) + def detection_predict_and_update_labels(self, indice, iou=0.3, replace=True, conf=0.1): + result = self.yoloe_predict(indice=indice, conf=conf) if not result: return - self._update_detection_label(indice,result[0],iou=iou,replace=replace) + self._update_detection_label(indice, result[0], iou=iou, replace=replace) def detection_predict_and_update_labels_batch(self, indices, iou=0.3, replace=False, conf=0.1): - results=self.yoloe_predict_batch([ self.labels[i] for i in indices ], conf=conf) - assert len(results)==len(indices), "Mismatch between results and indices length" - for indice,res in zip(indices,results): - 
self._update_detection_label(indice,res,iou=iou,replace=replace) + results = self.yoloe_predict_batch([self.labels[i] for i in indices], conf=conf) + assert len(results) == len(indices), "Mismatch between results and indices length" + for indice, res in zip(indices, results): + self._update_detection_label(indice, res, iou=iou, replace=replace) def _update_detection_label(self, indice, result_obj, iou=0.3, replace=True): assert self.data_style == "detection", "_update_detection_label requires detection data_style" - boxes=result_obj.boxes - bboxes_xyxy=boxes.xyxy.cpu().numpy() - yolo_box=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) - bboxes_xywhn=yolo_box.xywhn - cls=boxes.cls.cpu().numpy() - assert bboxes_xywhn.shape[0]==cls.shape[0], "Mismatch between number of boxes and classes" + boxes = result_obj.boxes + bboxes_xyxy = boxes.xyxy.cpu().numpy() + yolo_box = YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) + bboxes_xywhn = yolo_box.xywhn + cls = boxes.cls.cpu().numpy() + assert bboxes_xywhn.shape[0] == cls.shape[0], "Mismatch between number of boxes and classes" if replace: - self.labels[indice]['bboxes']=bboxes_xywhn - self.labels[indice]['cls']=cls + self.labels[indice]["bboxes"] = bboxes_xywhn + self.labels[indice]["cls"] = cls print(f"Replace with {bboxes_xywhn.shape[0]} boxes") return - keep_indices=[] + keep_indices = [] for i in range(bboxes_xywhn.shape[0]): - bbox=bboxes_xywhn[i] - max_iou=0 - for j in range(self.labels[indice]['bboxes'].shape[0]): - exist_bbox=self.labels[indice]['bboxes'][j] - box1=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(bbox[np.newaxis,:]).xyxy[0] - box2=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(exist_bbox[np.newaxis,:]).xyxy[0] - xi1=max(box1[0],box2[0]) - yi1=max(box1[1],box2[1]) - xi2=min(box1[2],box2[2]) - yi2=min(box1[3],box2[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box1[2]-box1[0])*(box1[3]-box1[1]) - box2_area=(box2[2]-box2[0])*(box2[3]-box2[1]) - union_area=box1_area+box2_area-inter_area - current_iou=inter_area/union_area if union_area>0 else 0 - if current_iou>max_iou: - max_iou=current_iou - if max_iou 0 else 0 + if current_iou > max_iou: + max_iou = current_iou + if max_iou < iou: keep_indices.append(i) print(f"Append {len(keep_indices)} new boxes out of {bboxes_xywhn.shape[0]}") for i in keep_indices: - bbox=bboxes_xywhn[i] - c=cls[i] - self.labels[indice]['bboxes']=np.vstack([self.labels[indice]['bboxes'],bbox]) - self.labels[indice]['cls']=np.vstack([self.labels[indice]['cls'],c]) - - def grounding_predict_and_update_labels_batch(self,indices,iou=0.05,replace=False,conf=0.1): - - - results=self.yoloe_predict_batch(indices, conf=conf) - assert len(results)==len(indices), "Mismatch between results and indices length" - for indice,res in zip(indices,results): - self.labels[indice]= self._update_grounding_label(self.labels[indice],res,iou=iou,replace=replace) + bbox = bboxes_xywhn[i] + c = cls[i] + self.labels[indice]["bboxes"] = np.vstack([self.labels[indice]["bboxes"], bbox]) + self.labels[indice]["cls"] = np.vstack([self.labels[indice]["cls"], c]) + def grounding_predict_and_update_labels_batch(self, indices, iou=0.05, replace=False, conf=0.1): + results = self.yoloe_predict_batch(indices, conf=conf) + assert len(results) == len(indices), "Mismatch between results and indices length" + for indice, res in zip(indices, results): + self.labels[indice] = self._update_grounding_label(self.labels[indice], 
res, iou=iou, replace=replace) def _update_grounding_label(self, label, result_obj, iou=0.1, replace=True): assert self.data_style == "grounding", "_update_grounding_label requires grounding data_style" - boxes=result_obj.boxes - bboxes_xyxy=boxes.xyxy.cpu().numpy() - yolo_box=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) - bboxes_xywhn=yolo_box.xywhn - cls=boxes.cls.cpu().numpy() + boxes = result_obj.boxes + bboxes_xyxy = boxes.xyxy.cpu().numpy() + yolo_box = YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xyxy(bboxes_xyxy) + bboxes_xywhn = yolo_box.xywhn + cls = boxes.cls.cpu().numpy() - assert bboxes_xywhn.shape[0]==cls.shape[0], "Mismatch between number of boxes and classes" + assert bboxes_xywhn.shape[0] == cls.shape[0], "Mismatch between number of boxes and classes" if replace: - label['bboxes']=bboxes_xywhn - label['cls']=cls - print(f"Replace with {bboxes_xywhn.shape[0]} boxes") + label["bboxes"] = bboxes_xywhn + label["cls"] = cls + print(f"Replace with {bboxes_xywhn.shape[0]} boxes") return - keep_indices=[] + keep_indices = [] for i in range(bboxes_xywhn.shape[0]): - bbox=bboxes_xywhn[i] - max_iou=0 - for j in range(label['bboxes'].shape[0]): - exist_bbox=label['bboxes'][j] - box1=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(bbox[np.newaxis,:]).xyxy[0] - box2=YoloBox(img_shape=result_obj.orig_img.shape[:2]).load_from_xywhn_normalized(exist_bbox[np.newaxis,:]).xyxy[0] - xi1=max(box1[0],box2[0]) - yi1=max(box1[1],box2[1]) - xi2=min(box1[2],box2[2]) - yi2=min(box1[3],box2[3]) - inter_area=max(0,xi2-xi1)*max(0,yi2-yi1) - box1_area=(box1[2]-box1[0])*(box1[3]-box1[1]) - box2_area=(box2[2]-box2[0])*(box2[3]-box2[1]) - union_area=box1_area+box2_area-inter_area - current_iou=inter_area/union_area if union_area>0 else 0 - if current_iou>max_iou: - max_iou=current_iou - if max_iou 0 else 0 + if current_iou > max_iou: + max_iou = current_iou + if max_iou < iou: keep_indices.append(i) # get current texts - current_texts= label['texts'] - current_texts= [ text[0] for text in current_texts] # remote the list structure inside - + current_texts = label["texts"] + current_texts = [text[0] for text in current_texts] # remote the list structure inside # get all bbox that should be appended from result_obj with their cls and texts - append_bboxes,append_cls,append_text = [],[],[] + append_bboxes, append_cls, append_text = [], [], [] for i in keep_indices: append_bboxes.append(bboxes_xywhn[i]) append_cls.append(cls[i]) append_text.append(self.names[int(cls[i])]) - - # update the current texts for text in append_text: if text not in current_texts: current_texts.append(text) - label['texts'] = [[text] for text in current_texts] # keep the list structure - + label["texts"] = [[text] for text in current_texts] # keep the list structure # update the append_cls to match the updated texts - updated_append_cls=[] + updated_append_cls = [] for text in append_text: - updated_cls=current_texts.index(text) # + updated_cls = current_texts.index(text) # updated_append_cls.append(updated_cls) - append_cls=updated_append_cls + append_cls = updated_append_cls # format - append_bboxes= np.array(append_bboxes).reshape(-1,4) - append_cls= np.array(append_cls).reshape(-1,1) - - + append_bboxes = np.array(append_bboxes).reshape(-1, 4) + append_cls = np.array(append_cls).reshape(-1, 1) # append the boxes and cls for i in range(append_bboxes.shape[0]): - bbox=append_bboxes[i] - c=append_cls[i] - label['bboxes']=np.vstack([label['bboxes'],bbox]) - 
label['cls']=np.vstack([label['cls'],c]) + bbox = append_bboxes[i] + c = append_cls[i] + label["bboxes"] = np.vstack([label["bboxes"], bbox]) + label["cls"] = np.vstack([label["cls"], c]) # print how many boxes are appended print(f"Append {append_bboxes.shape[0]} new boxes out of {bboxes_xywhn.shape[0]}") return label + def label_append_instance(self, indice, bboxes, cls, texts=None): + assert len(bboxes) == len(cls), "Length of bboxes and cls must be the same" - - def label_append_instance(self,indice,bboxes,cls,texts=None): - - assert len(bboxes)==len(cls), "Length of bboxes and cls must be the same" - - - - - def visual_and_save2(self, indice=None,filename=None, - save_path="./visualize2.jpg"): + def visual_and_save2(self, indice=None, filename=None, save_path="./visualize2.jpg"): """Visualizes a label using ultralytics.engine.results.Results and saves it.""" - - assert self.data_style in ["grounding","detection"] + assert self.data_style in ["grounding", "detection"] print("Visualizing index:", indice) - if indice is None : + if indice is None: assert filename is not None, "Either indice or filename must be provided" - + for idx, label in enumerate(self.labels): - im_file = label['im_file'] + im_file = label["im_file"] # if hasattr(self, 'img_source'): # im_file = os.path.join(self.img_source, im_file) print(im_file) @@ -440,23 +428,23 @@ def visual_and_save2(self, indice=None,filename=None, label = self.labels[indice] print("label keys:", label.keys()) - im_file = label['im_file'] + im_file = label["im_file"] - if hasattr(self, 'img_source'): + if hasattr(self, "img_source"): im_file = os.path.join(self.img_source, im_file) orig_img = np.array(Image.open(im_file)) img_h, img_w = orig_img.shape[:2] - - bboxes_xywhn = label['bboxes'] - cls = label['cls'] - if self.data_style=="detection": + + bboxes_xywhn = label["bboxes"] + cls = label["cls"] + if self.data_style == "detection": assert self.yaml_config is not None, "yaml_config must be provided for detection data_style" names = self.names - - elif self.data_style=="grounding": - names=label.get('texts',None) - names= [ text[0] for text in names] # remote the list structure inside + + elif self.data_style == "grounding": + names = label.get("texts", None) + names = [text[0] for text in names] # remote the list structure inside if isinstance(names, (list, tuple)): names = {int(i): str(n) for i, n in enumerate(names)} @@ -503,73 +491,62 @@ def visual_and_save2(self, indice=None,filename=None, print("Boxes data shape:", boxes_tensor.shape) # Create Results object - result = Results( - orig_img=np.array(orig_img), - path=im_file, - names=names, - boxes=boxes_tensor - ) - # print each bbox witth cls and name from the result object + result = Results(orig_img=np.array(orig_img), path=im_file, names=names, boxes=boxes_tensor) + # print each bbox with cls and name from the result object for i in range(result.boxes.shape[0]): box = result.boxes[i] cls_id = int(box.cls.item()) cls_name = result.names.get(cls_id, "unknown") print(f"Box {i}: Class ID = {cls_id}, Class Name = {cls_name}, Box Coordinates = {box.xyxy.tolist()}") - + print("Number of boxes in Results object:", len(result.boxes) if result.boxes is not None else 0) if result.boxes: - result.boxes.is_track = False # Set to false to avoid printing track_ids + result.boxes.is_track = False # Set to false to avoid printing track_ids result.save(save_path) - # # Plot the results # im_array = result.plot(conf=False) # conf=False to not show confidence scores - + # # Save the visualized 
image""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" print(f"Saved visualization to {save_path}") - from tqdm import tqdm +if __name__ == "__main__": + DATA_NAME = "flickr" # -if __name__=="__main__": - - DATA_NAME="flickr" # - - if DATA_NAME=="Objects365v1": - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" + if DATA_NAME == "Objects365v1": + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.load_yoloe() - de.set_classes(yaml_config=yaml_config) # set classes for the dataset + de.set_classes(yaml_config=yaml_config) # set classes for the dataset - batch_size=64 - for start in tqdm(range(0,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - de.detection_predict_and_update_labels_batch(batch_indices,iou=0.1,conf=0.1) + batch_size = 64 + for start in tqdm(range(0, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + de.detection_predict_and_update_labels_batch(batch_indices, iou=0.1, conf=0.1) de.save_cached_label(save_path=cache_path.replace(".cache", "_updated.cache")) - elif DATA_NAME=="mixed_grounding": - - - # set gpu 3 - device="cuda:1" - de=DataEngine(device=device) - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + elif DATA_NAME == "mixed_grounding": + # set gpu 3 + device = "cuda:1" + de = DataEngine(device=device) + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() - batch_size=32 - for start in tqdm(range(1000,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - batch_texts=[] + batch_size = 32 + for start in tqdm(range(1000, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + batch_texts = [] for indice in batch_indices: label_texts = de.labels[indice].get("texts", []) if isinstance(label_texts, list): @@ -583,7 +560,7 @@ def visual_and_save2(self, indice=None,filename=None, unique_texts = list(dict.fromkeys(batch_texts)) de.set_classes(name_list=unique_texts) else: - de.set_classes(name_list=None) + de.set_classes(name_list=None) # debug_indice = batch_indices[10] # de.visual_and_save2(debug_indice, save_path="./visualized_grounding_example.jpg") @@ -593,26 +570,23 @@ def visual_and_save2(self, indice=None,filename=None, print(f"Error processing batch starting at index {start}: {e}") # de.visual_and_save2(debug_indice, 
save_path="./visualized_grounding_example1.jpg") - de.save_cached_label(save_path=cache_path.replace(".cache", ".updated.cache")) - elif DATA_NAME=="flickr": - - + elif DATA_NAME == "flickr": # set gpu 2 - device="cuda:2" - de=DataEngine(device=device) - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + device = "cuda:2" + de = DataEngine(device=device) + cache_path = ( + "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() - batch_size=128 - for start in tqdm(range(1000,len(de),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(de)))) - batch_texts=[] + batch_size = 128 + for start in tqdm(range(1000, len(de), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(de)))) + batch_texts = [] for indice in batch_indices: label_texts = de.labels[indice].get("texts", []) if isinstance(label_texts, list): @@ -633,5 +607,4 @@ def visual_and_save2(self, indice=None,filename=None, de.grounding_predict_and_update_labels_batch(batch_indices, iou=0.1, conf=0.1) # de.visual_and_save2(debug_indice, save_path="./visualized_grounding_example1.jpg") - de.save_cached_label(save_path=cache_path.replace(".cache", ".updated.cache")) diff --git a/data_engine_agent.py b/data_engine_agent.py index db84554..c43f342 100644 --- a/data_engine_agent.py +++ b/data_engine_agent.py @@ -1,25 +1,26 @@ -from git import List -from matplotlib.pylab import sample -import ultralytics,os +from __future__ import annotations + +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) -from collections import defaultdict +import copy import json -from concurrent.futures import ProcessPoolExecutor, as_completed -from tqdm import tqdm +import multiprocessing as mp import os -import numpy as np -from pathlib import Path from collections import defaultdict -import multiprocessing as mp -from yoloe_data_engine.data_engine import DataEngine +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from pathlib import Path as _Path -import copy import numpy as np -from pathlib import Path as _Path +from tqdm import tqdm +from yoloe_data_engine.data_engine import DataEngine IMAGES_CACHE = None IMNAME_ANNS_CACHE = None @@ -44,6 +45,7 @@ def to_serializable(obj): ######################## Grounding Data Loading Worker ######################## + def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): """Worker invoked in subprocesses to build per-image grounding labels.""" global IMAGES_CACHE, IMNAME_ANNS_CACHE @@ -54,6 +56,7 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): return from ultralytics.data.converter import merge_multi_segment from ultralytics.data.dataset import segments2boxes + img = IMAGES_CACHE[f"{imid:d}"] h, w, f = img["height"], img["width"], img["file_name"] im_file = Path(im_dir) / f # Use the passed im_dir @@ -78,14 +81,14 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, 
folder_name): if box[2] <= 0 or box[3] <= 0: continue caption = ann["caption"] - cat_name = " ".join([caption[t[0]:t[1]] for t in ann["tokens_positive"]]).lower().strip() + cat_name = " ".join([caption[t[0] : t[1]] for t in ann["tokens_positive"]]).lower().strip() if not cat_name: continue if cat_name not in cat2id: cat2id[cat_name] = len(cat2id) texts.append([cat_name]) cls = cat2id[cat_name] - box = [cls] + box.tolist() + box = [cls, *box.tolist()] if box not in bboxes: bboxes.append(box) if ann.get("segmentation") is not None: @@ -97,8 +100,12 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): s = (np.concatenate(s, axis=0) / np.array([w, h], dtype=np.float32)).reshape(-1).tolist() else: s = [j for i in ann["segmentation"] for j in i] - s = (np.array(s, dtype=np.float32).reshape(-1, 2) / np.array([w, h], dtype=np.float32)).reshape(-1).tolist() - s = [cls] + s + s = ( + (np.array(s, dtype=np.float32).reshape(-1, 2) / np.array([w, h], dtype=np.float32)) + .reshape(-1) + .tolist() + ) + s = [cls, *s] segments.append(s) bboxes_xyxy.append(ann["bbox"]) lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32) @@ -117,9 +124,8 @@ def _load_grounding_data(buffer_dir, im_dir, imid, anns, folder_name): "bbox_format": "xywh", "texts": texts, } - def serializeLabel(label): - + def serializeLabel(label): lc = copy.deepcopy(label) lc["im_file"] = str(lc.get("im_file", "")) lc["shape"] = list(lc.get("shape", [])) @@ -135,15 +141,16 @@ def serializeLabel(label): lc["normalized"] = bool(lc.get("normalized", True)) lc["bbox_format"] = str(lc.get("bbox_format", "xywh")) return lc + label_serialized = serializeLabel(label) # tmp_file = str(dst_file) + ".tmp" import json + with open(dst_file, "w") as file: json.dump(label_serialized, file, indent=4, ensure_ascii=False) # os.replace(tmp_file, str(dst_file)) - def worker_wrapper(args): return _load_grounding_data(*args) @@ -154,19 +161,21 @@ def init_worker(images_data, imname_anns_data): IMAGES_CACHE = images_data IMNAME_ANNS_CACHE = imname_anns_data + ################################## multi-processing model prediction ############################################ -def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): - """ - Batch model predict in a single process. This can be a method of DataEngine. + +def _batch_model_predict_single_process(self, buffer_dir, im_files, **kwargs): + """Batch model predict in a single process. This can be a method of DataEngine. + Args: self: DataEngine instance buffer_dir: str, buffer directory to save results im_files: list of str, image file paths - kwargs: other keyword arguments for model.predict + kwargs: other keyword arguments for model.predict. 
""" assert isinstance(self, DataEngine) - engine=self + engine = self dst_dir = os.path.join(buffer_dir, "model_predict") os.makedirs(dst_dir, exist_ok=True) conf = kwargs.get("conf", 0.5) @@ -178,7 +187,9 @@ def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): print("All images have been processed, skip.") return process_img_files = [im_files[i] for i in indices] - results = list(engine.model.predict(process_img_files, conf=conf, iou=iou, batch=len(process_img_files), stream=True)) + results = list( + engine.model.predict(process_img_files, conf=conf, iou=iou, batch=len(process_img_files), stream=True) + ) print(f"Processed {len(process_img_files)} images.") for i, sample_index in enumerate(indices): sample = Sample() @@ -188,11 +199,9 @@ def _batch_model_predict_single_process(self,buffer_dir, im_files, **kwargs): return - def _device_predict_worker(args): - """ - Worker function for multi-process model prediction on a specific device. - args: tuple containing (device, buffer_dir, batches, kwargs) + """Worker function for multi-process model prediction on a specific device. args: tuple containing (device, + buffer_dir, batches, kwargs). """ device, buffer_dir, batches, kwargs = args @@ -206,21 +215,21 @@ def _device_predict_worker(args): for im_files in tqdm(batches, desc=f"Device {device} processing batches"): _batch_model_predict_single_process(engine, buffer_dir, im_files, **worker_kwargs) return True + + ############################################################################## +def _merge_prediction_to_sample_label(buffer_dir, sample_json, model_predict_json): + """Each sample have a file_name, we merge the model prediction results (model_predict_json) into the sample + grounding label. step 1: first check the filename match, if false, raise error. step 2: check the dst file + exist, if true, skip. step 3: merge model prediction results into sample grounding label, iou score > 0.5 will + be ignored. step 4: save the merged label to buffer_dir/merge_prediction/. -def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json): - """ - Each sample have a file_name, we merge the model prediction results (model_predict_json) into the sample grounding label. - step 1: first check the filename match, if false, raise error. - step 2: check the dst file exist, if true, skip. - step 3: merge model prediction results into sample grounding label, iou score > 0.5 will be ignored. - step 4: save the merged label to buffer_dir/merge_prediction/ Args: buffer_dir: str, buffer directory to save results sample_json: str, path to sample grounding label json file - model_predict_json: str, path to model prediction json file + model_predict_json: str, path to model prediction json file. 
""" dst_dir = os.path.join(buffer_dir, "merge_prediction") os.makedirs(dst_dir, exist_ok=True) @@ -238,7 +247,7 @@ def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json for model_inst in predict_sample.instances: # Defensive: skip invalid model instances - if getattr(model_inst, 'bbox', None) is None: + if getattr(model_inst, "bbox", None) is None: print(f"[merge][WARN] skipping model instance with empty bbox in '{sample_json}'") continue try: @@ -261,6 +270,7 @@ def _merge_prediction_to_sample_label(buffer_dir,sample_json, model_predict_json return True # print(f"Merged label saved to {dst_file}") + def merge_prediction_worker(args): # Support both (idx, buffer_dir, sample_json, model_predict_json) and (buffer_dir, sample_json, model_predict_json) try: @@ -280,13 +290,13 @@ def merge_prediction_worker(args): return _merge_prediction_to_sample_label(buffer_dir, sample_json, model_predict_json) except Exception as e: import traceback + tb = traceback.format_exc() - print(f"[worker][ERROR] idx={idx} file='{sample_json}': {repr(e)}\n{tb}") + print(f"[worker][ERROR] idx={idx} file='{sample_json}': {e!r}\n{tb}") return False - -############################################################################## +############################################################################## class YoloBox: @@ -386,6 +396,7 @@ def iou(self, bbox_xyxy): ious.append(iou) return np.array(ious) + class Instance: def __init__(self, bbox=None, **kwargs): self.bbox = bbox @@ -406,7 +417,7 @@ def set_embed(self, embed): def set_vpe(self, vpe: np.ndarray): self.vpe = vpe.squeeze() - def set_text(self, texts: list, conf: list = None): + def set_text(self, texts: list, conf: list | None = None): self.text = texts self.conf = conf assert len(texts) == len(conf) @@ -418,20 +429,22 @@ def get_top_text_conf(self): def to_dict(self): return { - 'bbox': to_serializable(self.bbox), - 'text': to_serializable(self.text), - 'conf': to_serializable(self.conf), - 'embed': to_serializable(self.embed), - 'vp': to_serializable(self.vpe), - 'other_data': to_serializable(self.other_data) + "bbox": to_serializable(self.bbox), + "text": to_serializable(self.text), + "conf": to_serializable(self.conf), + "embed": to_serializable(self.embed), + "vp": to_serializable(self.vpe), + "other_data": to_serializable(self.other_data), } + def from_dict(self, data: dict): - self.bbox = data.get('bbox') - self.text = data.get('text') - self.conf = data.get('conf') - self.embed = data.get('embed') - self.vpe = data.get('vpe') - self.other_data = data.get('other_data', {}) + self.bbox = data.get("bbox") + self.text = data.get("text") + self.conf = data.get("conf") + self.embed = data.get("embed") + self.vpe = data.get("vpe") + self.other_data = data.get("other_data", {}) + class Sample: def __init__(self): @@ -445,7 +458,8 @@ def load_from_grounding_label(self, grounding_data): if isinstance(grounding_data, str): assert grounding_data.endswith(".json"), "If grounding_data is str, it should be a json file path." 
import json - with open(grounding_data, 'r') as f: + + with open(grounding_data) as f: grounding_data = json.load(f) assert isinstance(grounding_data, dict), "grounding_data should be a dict" @@ -465,7 +479,9 @@ def load_from_grounding_label(self, grounding_data): self.other_data["normalized"] = normalized assert normalized is True # assert bbox_format == "xywhn" - for cls, box, segment in zip(grounding_data.get("cls", []), grounding_data.get("bboxes", []), grounding_data.get("segments", [])): + for cls, box, segment in zip( + grounding_data.get("cls", []), grounding_data.get("bboxes", []), grounding_data.get("segments", []) + ): # Convert normalized xywh to xyxy for internal consistency bbox_xyxy = YoloBox(self.shape).load_from_xywhn_normalized(np.array([box], dtype=np.float32)).xyxy[0] # Create instance with xyxy bbox @@ -506,14 +522,14 @@ def load_from_grounding_label(self, grounding_data): # return grounding_data def load_from_yoloe_result(self, yoloe_result): - if isinstance(yoloe_result, str): assert yoloe_result.endswith(".json"), "If yoloe_result is str, it should be a json file path." import json - with open(yoloe_result, 'r') as f: + + with open(yoloe_result) as f: yoloe_result = json.load(f) assert isinstance(yoloe_result, dict), "yoloe_result should be a dict" - + self.instances = [] self.im_file = yoloe_result.get("im_file") self.shape = (yoloe_result.get("orig_shape", [0, 0])[0], yoloe_result.get("orig_shape", [0, 0])[1]) @@ -535,30 +551,31 @@ def load_from_yoloe_result(self, yoloe_result): def to_dict(self): return { - 'im_file': to_serializable(self.im_file), - 'instances': [inst.to_dict() for inst in self.instances], - 'other_data': to_serializable(self.other_data) + "im_file": to_serializable(self.im_file), + "instances": [inst.to_dict() for inst in self.instances], + "other_data": to_serializable(self.other_data), } - - def save_to_json(self, json_path): import json - with open(json_path, 'w') as f: + + with open(json_path, "w") as f: json.dump(self.to_dict(), f, indent=4) # print(f"Saved sample to {json_path}") - def load_from_json(self,json_path): + def load_from_json(self, json_path): import json - with open(json_path, 'r') as f: + + with open(json_path) as f: data = json.load(f) - self.im_file = data.get('im_file') + self.im_file = data.get("im_file") self.instances = [] - for inst_data in data.get('instances', []): + for inst_data in data.get("instances", []): inst = Instance() inst.from_dict(inst_data) self.instances.append(inst) - self.other_data = data.get('other_data', {}) + self.other_data = data.get("other_data", {}) + class DataEngineAgent: def __init__(self, devices=["cuda:0"], buffer_dir="/root/ultra_louis_work/engine_buffer"): @@ -582,10 +599,6 @@ def set_classes(self, texts: list | None): model.set_classes(name_list=texts) self.texts = texts - - - - def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0.4, batch_size=3, max_workers=None): im_files = [] for file_name in os.listdir(im_dir): @@ -594,7 +607,7 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. # im_files=im_files[:128] print(f"Total images to process: {len(im_files)}") - batches = [im_files[i:i+batch_size] for i in range(0, len(im_files), batch_size)] + batches = [im_files[i : i + batch_size] for i in range(0, len(im_files), batch_size)] print(f"Total batches: {len(batches)}, batch size: {batch_size}") if not batches: return [] @@ -615,7 +628,7 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. 
assigned_batches = batches[idx::device_count] if not assigned_batches: continue - kwargs = {'conf': conf, 'iou': iou, 'texts': texts} + kwargs = {"conf": conf, "iou": iou, "texts": texts} process_args.append((device, self.buffer_dir, assigned_batches, kwargs)) if not process_args: @@ -629,13 +642,10 @@ def multi_process_batch_model_predict(self, im_dir, texts=None, conf=0.5, iou=0. for future in tqdm(as_completed(futures), total=len(futures), desc="Model predict ..."): future.result() return results - - # print(f"Saved sample to {dst_file}") def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_image, max_workers=8): - print("Start multi-process loading of grounding data...") self.im_dir = im_dir with open(json_file) as f: @@ -659,7 +669,7 @@ def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_ imid_anns[ann["image_id"]].append(ann) self.img_path = annotations.get("img_path", "") imids = list(imid_anns.keys()) - + print(f"Total images to process: {len(imids)}") init_args = (images_data, imname_anns_data) @@ -673,25 +683,30 @@ def multi_process_load_grounding_data(self, im_dir, json_file, merge_within_one_ chunk_size = max(1, min(500, len(imids) // (worker_count * 4) if worker_count > 0 else 1)) print(f"Using {worker_count} workers and chunksize: {chunk_size}") - list(tqdm(executor.map(worker_wrapper, tasks, chunksize=chunk_size), total=len(tasks), desc="Loading grounding data")) + list( + tqdm( + executor.map(worker_wrapper, tasks, chunksize=chunk_size), + total=len(tasks), + desc="Loading grounding data", + ) + ) print("Finished loading grounding data.") - def multi_process_merge_prediction(self,json_dir,predict_json_dir,max_workers=8): - - json_files= [] + def multi_process_merge_prediction(self, json_dir, predict_json_dir, max_workers=8): + json_files = [] predict_json_files = [] for sample_file_name in os.listdir(json_dir): if sample_file_name.endswith(".json"): - json_path= os.path.join(json_dir, sample_file_name) + json_path = os.path.join(json_dir, sample_file_name) json_files.append(json_path) # read json_path and get im_file name - with open(json_path, 'r') as f: + with open(json_path) as f: sample_data = json.load(f) im_file = sample_data.get("im_file") im_name = os.path.splitext(os.path.basename(im_file))[0] - predict_json_path= os.path.join(predict_json_dir, f"{im_name}.json") + predict_json_path = os.path.join(predict_json_dir, f"{im_name}.json") if os.path.exists(predict_json_path): predict_json_files.append(predict_json_path) else: @@ -738,78 +753,72 @@ def multi_process_merge_prediction(self,json_dir,predict_json_dir,max_workers=8) if total % 10000 == 0: print(f"[merge] Progress: {ok}/{total} succeeded") print(f"[merge] Done: {ok}/{total} succeeded") - def _merge_predict(self): pass - - def read_numpy_and_print(path=None): def load_dataset_cache_file(path: Path) -> dict: import gc + gc.disable() cache = np.load(str(path), allow_pickle=True).item() gc.enable() return cache + path = "/root/ultra_louis_work/engine_buffer/grounding_data/5.cache" data = load_dataset_cache_file(path) print(data) -if __name__ == "__main__": - - devices = ["cuda:0","cuda:1","cuda:2","cuda:3"] +if __name__ == "__main__": + devices = ["cuda:0", "cuda:1", "cuda:2", "cuda:3"] # agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/flickr_engine_buffer") # json_file = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json" # im_dir = "../datasets/flickr/full_images/" # 
mobileclip_text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + DATA = "mixed_grounding" # "mixed_grounding" - DATA="mixed_grounding" # "mixed_grounding" - - if DATA=="flickr": - - + if DATA == "flickr": agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/flickr_engine_buffer") json_file = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json" im_dir = "../datasets/flickr/full_images/" # mobileclip_text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - mobileclip_text_embed_pt=r"/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - + mobileclip_text_embed_pt = ( + r"/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + ) + import torch + txt_map = torch.load(mobileclip_text_embed_pt, map_location="cuda:0") - name_list = list (txt_map.keys())[:50000] + name_list = list(txt_map.keys())[:50000] # agent.multi_process_batch_model_predict(im_dir=im_dir, texts=name_list, conf=0.5, iou=0.4, batch_size=2) # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - agent.multi_process_merge_prediction(json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/grounding_data_merged", - predict_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict", - max_workers=8) - - elif DATA=="mixed_grounding": - + agent.multi_process_merge_prediction( + json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/grounding_data_merged", + predict_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict", + max_workers=8, + ) + elif DATA == "mixed_grounding": agent = DataEngineAgent(devices=devices, buffer_dir="/root/ultra_louis_work/runs/mixed_engine_buffer") - json_file= "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" - im_dir="../datasets/mixed_grounding/gqa/images" + json_file = "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" + im_dir = "../datasets/mixed_grounding/gqa/images" mobileclip_text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - + # import torch # txt_map= torch.load(mobileclip_text_embed_pt, map_location="cuda:0") # name_list=list(txt_map.keys())[:50000] # agent.multi_process_batch_model_predict(im_dir=im_dir, texts=name_list, conf=0.5, iou=0.4,batch_size=2) - # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - agent.multi_process_merge_prediction(json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/grounding_data_merged", - predict_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict", - max_workers=8) + agent.multi_process_merge_prediction( + json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/grounding_data_merged", + predict_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict", + max_workers=8, + ) # agent.multi_process_load_grounding_data(json_file=json_file, im_dir=im_dir, merge_within_one_image=True, max_workers=8) - - - - - diff --git a/data_visual.py b/data_visual.py index b8f6630..c6db357 100644 --- a/data_visual.py +++ b/data_visual.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,46 +9,35 @@ 
from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 - + im_index = 0 - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.load_yoloe() print("length of labels:", len(de.labels)) print(de.labels[im_index]["im_file"]) # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) print("length of labels:", len(de.labels)) # de.load_yoloe() print(de.labels[im_index]["im_file"]) # de.visual_and_save2(im_index, save_path="./visualized_grounding_example1.jpg") - diff --git a/data_visual_flickr.py b/data_visual_flickr.py index 5d94a3d..b10405b 100644 --- a/data_visual_flickr.py +++ b/data_visual_flickr.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,41 +9,29 @@ from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 + im_index = 0 - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = 
"/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example1.jpg") diff --git a/data_visual_mixed.py b/data_visual_mixed.py index 788795e..3072676 100644 --- a/data_visual_mixed.py +++ b/data_visual_mixed.py @@ -1,48 +1,41 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) from data_engine import DataEngine -import numpy as np - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 - + im_index = 0 # de=DataEngine() # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) # de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.print_data_info() de.visual_and_save2(filename="353913.jpg", save_path="./visualized_grounding_example_v2.jpg") - - diff --git a/data_visual_object365.py b/data_visual_object365.py index d25cf2d..8a745d0 100644 --- 
a/data_visual_object365.py +++ b/data_visual_object365.py @@ -1,4 +1,7 @@ -import ultralytics,os +import os + +import ultralytics + workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -6,35 +9,28 @@ from data_engine import DataEngine - - - - if __name__ == "__main__": - - # device="cuda:1" # de=DataEngine(device=device) # cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.cache" # text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - # de.load_cached_label(cache_path=cache_path, - # data_style="grounding", + # de.load_cached_label(cache_path=cache_path, + # data_style="grounding", # text_embed_pt=text_embed_pt) - im_index=0 + im_index = 0 - - de=DataEngine(device="cuda") - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.print_data_info() # de.visual_and_save2(im_index, save_path="./visualized_grounding_example.jpg") - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) - de.print_data_info() \ No newline at end of file + de.print_data_info() diff --git a/do_flickr.sh b/do_flickr.sh index 437196c..a9f1a80 100644 --- a/do_flickr.sh +++ b/do_flickr.sh @@ -1,17 +1,14 @@ -# activate clipenv conda env +# activate clipenv conda env source ~/miniconda3/etc/profile.d/conda.sh conda activate clipenv - # remove /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A if it exists if [ -f /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A ]; then - rm /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A + rm /root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.cache.A fi - # run the refine_text.py script to generate refined labels and cache for the Flickr dataset -python3 yoloe_data_engine/refine_text.py +python3 yoloe_data_engine/refine_text.py # run data visualization script -python3 yoloe_data_engine/data_visual_flickr.py - +python3 yoloe_data_engine/data_visual_flickr.py diff --git a/do_mixed.sh b/do_mixed.sh index 898043a..f38e5c8 100644 --- a/do_mixed.sh +++ b/do_mixed.sh @@ -1,16 +1,10 @@ -# activate clipenv conda env +# activate clipenv conda env source ~/miniconda3/etc/profile.d/conda.sh conda activate clipenv - -# set gpu id to 2,3 +# set gpu id to 2,3 export CUDA_VISIBLE_DEVICES=2,3 - - - # run the refine_text.py script to generate refined labels and cache for the mixed-grounding dataset -python3 yoloe_data_engine/refine_text.py --img_path /root/ultra_louis_work/datasets/mixed_grounding/gqa/images \ ---json_file
/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json - - +python3 yoloe_data_engine/refine_text.py --img_path /root/ultra_louis_work/datasets/mixed_grounding/gqa/images \ + --json_file /root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json diff --git a/grounding_dataset_visualizer.py b/grounding_dataset_visualizer.py index adc3d11..841d85d 100644 --- a/grounding_dataset_visualizer.py +++ b/grounding_dataset_visualizer.py @@ -1,19 +1,17 @@ +import os -from ultralytics import YOLOE -from ultralytics.models.yolo.yoloe import YOLOEVPTrainer +import ultralytics -import ultralytics,os workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) +import colorsys + +import cv2 import matplotlib.pyplot as plt -import matplotlib.patches as patches import numpy as np -from PIL import Image -import cv2 -import colorsys try: from ultralytics.data import GroundingDataset @@ -23,14 +21,13 @@ class DatasetVisualizer: - def __init__(self, *args, **kwargs): # Initialize with dataset parameters super().__init__(*args, **kwargs) self.colors = self._generate_colors(100) # Generate colors for different classes - + def _generate_colors(self, num_colors): - """Generate distinct colors for visualization""" + """Generate distinct colors for visualization.""" colors = [] for i in range(num_colors): hue = i / num_colors @@ -38,77 +35,75 @@ def _generate_colors(self, num_colors): rgb = colorsys.hsv_to_rgb(hue, 0.8, 0.9) colors.append([int(c * 255) for c in rgb]) return colors - + def _convert_tensor_to_numpy(self, data): - """Convert PyTorch tensor to numpy array if needed""" - if hasattr(data, 'cpu'): # PyTorch tensor + """Convert PyTorch tensor to numpy array if needed.""" + if hasattr(data, "cpu"): # PyTorch tensor return data.cpu().numpy() elif isinstance(data, np.ndarray): return data else: return np.array(data) - + def _truncate_text(self, text, max_length=20): - """Truncate text for display if too long""" + """Truncate text for display if too long.""" if isinstance(text, str) and len(text) > max_length: - return text[:max_length-3] + "..." + return text[: max_length - 3] + "..." 
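+        # e.g. _truncate_text("a very long caption", 10) -> "a very ..."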
return str(text) - + def _draw_bbox_on_image(self, image, bbox, label, color, confidence=None): - """Draw bounding box and label on image""" + """Draw bounding box and label on image.""" if isinstance(image, np.ndarray): img = image.copy() else: img = np.array(image) - + # Convert bbox format - assuming YOLO format [center_x, center_y, width, height] center_x, center_y, width, height = bbox - + # Convert to xyxy format x1 = center_x - width / 2 y1 = center_y - height / 2 x2 = center_x + width / 2 y2 = center_y + height / 2 - + # Check if coordinates are valid if x1 >= x2 or y1 >= y2: return img - + # Check if coordinates are within image bounds h, w = img.shape[:2] - + # Convert normalized coordinates to pixel coordinates if needed if x2 <= 1.0: # Normalized coordinates x1, x2 = x1 * w, x2 * w y1, y2 = y1 * h, y2 * h - + # Ensure coordinates are within image bounds - x1 = max(0, min(w-1, x1)) - y1 = max(0, min(h-1, y1)) - x2 = max(0, min(w-1, x2)) - y2 = max(0, min(h-1, y2)) - + x1 = max(0, min(w - 1, x1)) + y1 = max(0, min(h - 1, y1)) + x2 = max(0, min(w - 1, x2)) + y2 = max(0, min(h - 1, y2)) + # Draw rectangle cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2) - + # Prepare label text (truncate if too long) label_text = self._truncate_text(label, max_length=15) if confidence is not None: label_text += f" {confidence:.2f}" - + # Draw label background (text_width, text_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1) - cv2.rectangle(img, (int(x1), int(y1) - text_height - 10), - (int(x1) + text_width, int(y1)), color, -1) - + cv2.rectangle(img, (int(x1), int(y1) - text_height - 10), (int(x1) + text_width, int(y1)), color, -1) + # Draw label text - cv2.putText(img, label_text, (int(x1), int(y1) - 5), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1) - + cv2.putText(img, label_text, (int(x1), int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1) + return img def visualize(self, idx: int): - """Visualize a single sample from the dataset""" + """Visualize a single sample from the dataset.""" # Get sample data sample = self.__getitem__(idx) print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}") @@ -119,22 +114,22 @@ def visualize(self, idx: int): # Extract image, bboxes, labels, and texts from the sample if isinstance(sample, dict): - image = sample.get('img', sample.get('image')) - bboxes = sample.get('bboxes', sample.get('bbox')) - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) - texts = sample.get('texts', sample.get('text', [])) # Extract text labels + image = sample.get("img", sample.get("image")) + bboxes = sample.get("bboxes", sample.get("bbox")) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) + texts = sample.get("texts", sample.get("text", [])) # Extract text labels else: # If sample is a tuple/list (image, target) image, target = sample if isinstance(target, dict): - bboxes = target.get('bboxes', target.get('bbox')) - labels = target.get('cls', target.get('labels', target.get('classes'))) - texts = target.get('texts', target.get('text', [])) + bboxes = target.get("bboxes", target.get("bbox")) + labels = target.get("cls", target.get("labels", target.get("classes"))) + texts = target.get("texts", target.get("text", [])) else: bboxes, labels, texts = target, None, [] - + print(f"Found {len(texts) if texts else 0} text labels: {texts[:3] if texts else 'None'}") - + # Convert tensors to numpy arrays if image is not None: image = 
self._convert_tensor_to_numpy(image) @@ -142,7 +137,7 @@ def visualize(self, idx: int): bboxes = self._convert_tensor_to_numpy(bboxes) if labels is not None: labels = self._convert_tensor_to_numpy(labels) - + # Convert image if needed if isinstance(image, np.ndarray): if len(image.shape) == 3 and image.shape[0] == 3: # CHW format @@ -153,21 +148,21 @@ def visualize(self, idx: int): # Ensure proper data type if image.dtype != np.uint8: image = image.astype(np.uint8) - + # Handle different bbox and label formats if bboxes is not None and len(bboxes) > 0: # Flatten labels if they have extra dimensions if labels is not None and len(labels.shape) > 1: labels = labels.flatten() - + print(f"texts: {texts}") print(f"labels: {labels}") - + # Draw bboxes on image viz_image = image.copy() for i, bbox in enumerate(bboxes): color = self.colors[i % len(self.colors)] - + # Use class index to get text label from texts array if labels is not None and i < len(labels) and texts: class_idx = int(labels[i]) @@ -184,11 +179,11 @@ def visualize(self, idx: int): label = f"class_{int(labels[i])}" else: label = f"obj_{i}" - + viz_image = self._draw_bbox_on_image(viz_image, bbox, label, color) else: viz_image = image - + # Display using matplotlib plt.figure(figsize=(12, 8)) if len(viz_image.shape) == 3: @@ -198,53 +193,51 @@ def visualize(self, idx: int): else: plt.imshow(viz_image) else: - plt.imshow(viz_image, cmap='gray') + plt.imshow(viz_image, cmap="gray") plt.title(f"Sample {idx}") - plt.axis('off') + plt.axis("off") plt.tight_layout() plt.show() - + return viz_image def batch_visualize(self, indices: list): - """ - Show multiple samples in matplotlib subplots - """ + """Show multiple samples in matplotlib subplots.""" n_samples = len(indices) if n_samples == 1: self.visualize(indices[0]) return - + # Calculate subplot layout cols = min(3, n_samples) rows = (n_samples + cols - 1) // cols - - fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 5*rows)) + + _fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 5 * rows)) if rows == 1: axes = [axes] if cols == 1 else axes else: axes = axes.flatten() if n_samples > 1 else [axes] - + for i, idx in enumerate(indices): # Get sample data sample = self.__getitem__(idx) - + # Extract image, bboxes, labels, and texts from the sample if isinstance(sample, dict): - image = sample.get('img', sample.get('image')) - bboxes = sample.get('bboxes', sample.get('bbox')) - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) - texts = sample.get('texts', sample.get('text', [])) # Extract text labels + image = sample.get("img", sample.get("image")) + bboxes = sample.get("bboxes", sample.get("bbox")) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) + texts = sample.get("texts", sample.get("text", [])) # Extract text labels else: # If sample is a tuple/list (image, target) image, target = sample if isinstance(target, dict): - bboxes = target.get('bboxes', target.get('bbox')) - labels = target.get('cls', target.get('labels', target.get('classes'))) - texts = target.get('texts', target.get('text', [])) + bboxes = target.get("bboxes", target.get("bbox")) + labels = target.get("cls", target.get("labels", target.get("classes"))) + texts = target.get("texts", target.get("text", [])) else: bboxes, labels, texts = target, None, [] - + # Convert tensors to numpy arrays if image is not None: image = self._convert_tensor_to_numpy(image) @@ -252,7 +245,7 @@ def batch_visualize(self, indices: list): bboxes = self._convert_tensor_to_numpy(bboxes) if 
labels is not None: labels = self._convert_tensor_to_numpy(labels) - + # Convert image if needed if isinstance(image, np.ndarray): if len(image.shape) == 3 and image.shape[0] == 3: # CHW format @@ -263,18 +256,18 @@ def batch_visualize(self, indices: list): # Ensure proper data type if image.dtype != np.uint8: image = image.astype(np.uint8) - + # Handle different bbox and label formats if bboxes is not None and len(bboxes) > 0: # Flatten labels if they have extra dimensions if labels is not None and len(labels.shape) > 1: labels = labels.flatten() - + # Draw bboxes on image viz_image = image.copy() for j, bbox in enumerate(bboxes): color = self.colors[j % len(self.colors)] - + # Use class index to get text label from texts array if labels is not None and j < len(labels) and texts: class_idx = int(labels[j]) @@ -291,11 +284,11 @@ def batch_visualize(self, indices: list): label = f"class_{int(labels[j])}" else: label = f"obj_{j}" - + viz_image = self._draw_bbox_on_image(viz_image, bbox, label, color) else: viz_image = image - + # Display in subplot ax = axes[i] if n_samples > 1 else axes[0] if len(viz_image.shape) == 3: @@ -305,139 +298,130 @@ def batch_visualize(self, indices: list): else: ax.imshow(viz_image) else: - ax.imshow(viz_image, cmap='gray') + ax.imshow(viz_image, cmap="gray") ax.set_title(f"Sample {idx}") - ax.axis('off') - + ax.axis("off") + # Hide unused subplots for i in range(n_samples, len(axes)): - axes[i].axis('off') - + axes[i].axis("off") + plt.tight_layout() plt.show() def random_visualize(self, n=5): import random + indices = random.sample(range(len(self)), n) self.batch_visualize(indices) - + def save_visualization(self, idx: int, save_path: str): - """Save visualization to file""" + """Save visualization to file.""" viz_image = self.visualize(idx) if viz_image is not None: cv2.imwrite(save_path, viz_image) print(f"Visualization saved to {save_path}") - + def get_dataset_info(self): - """Get basic information about the dataset""" + """Get basic information about the dataset.""" try: print(f"Dataset length: {len(self)}") - + # Sample first item to understand structure sample = self.__getitem__(0) print(f"Sample type: {type(sample)}") - + if isinstance(sample, dict): print(f"Sample keys: {list(sample.keys())}") - if 'img' in sample: + if "img" in sample: print(f"Image shape: {sample['img'].shape}") - if 'bboxes' in sample: + if "bboxes" in sample: print(f"Number of bboxes: {len(sample['bboxes'])}") - if 'cls' in sample: + if "cls" in sample: print(f"Classes: {sample['cls']}") else: print(f"Sample structure: {[type(x) for x in sample]}") - + except Exception as e: print(f"Error getting dataset info: {e}") - + def visualize_class_distribution(self, max_samples=1000): - """Visualize the distribution of classes in the dataset""" + """Visualize the distribution of classes in the dataset.""" class_counts = {} n_samples = min(len(self), max_samples) - + for i in range(n_samples): try: sample = self.__getitem__(i) labels = None - + if isinstance(sample, dict): - labels = sample.get('cls', sample.get('labels', sample.get('classes'))) + labels = sample.get("cls", sample.get("labels", sample.get("classes"))) else: _, target = sample if isinstance(target, dict): - labels = target.get('cls', target.get('labels', target.get('classes'))) - + labels = target.get("cls", target.get("labels", target.get("classes"))) + if labels is not None: # Convert tensor to numpy if needed labels = self._convert_tensor_to_numpy(labels) # Flatten if needed if len(labels.shape) > 1: labels = 
labels.flatten() - + for label in labels: class_counts[str(int(label))] = class_counts.get(str(int(label)), 0) + 1 - + except Exception as e: print(f"Error processing sample {i}: {e}") continue - + # Plot distribution if class_counts: plt.figure(figsize=(12, 6)) classes = list(class_counts.keys()) counts = list(class_counts.values()) - + plt.bar(classes[:20], counts[:20]) # Show top 20 classes - plt.xlabel('Class') - plt.ylabel('Count') - plt.title('Class Distribution (Top 20)') + plt.xlabel("Class") + plt.ylabel("Count") + plt.title("Class Distribution (Top 20)") plt.xticks(rotation=45) plt.tight_layout() plt.show() - + print(f"Total classes found: {len(class_counts)}") print(f"Most frequent classes: {sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:10]}") else: print("No class information found in dataset") - - - class GroundingDatasetVisualizer(GroundingDataset, DatasetVisualizer): - pass - if __name__ == "__main__": - - - - img_path="../datasets/mixed_grounding/gqa/images", - json_file="../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json", + img_path = "../datasets/mixed_grounding/gqa/images" + json_file = "../datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.json" visualizer = GroundingDatasetVisualizer(json_file=json_file, img_path=img_path, augment=False) # Get dataset information print("=== Dataset Information ===") visualizer.get_dataset_info() - + # Visualize single sample to test text labels print("\n=== Single Sample Visualization ===") # visualizer.visualize(1) - + # Uncomment these for more visualizations: # print("\n=== Batch Visualization ===") - visualizer.batch_visualize([0,1,2,3,4]) - + visualizer.batch_visualize([0, 1, 2, 3, 4]) + # Random visualization print("\n=== Random Visualization ===") # visualizer.random_visualize(n=3) - + # Class distribution analysis print("\n=== Class Distribution ===") # visualizer.visualize_class_distribution(max_samples=100) - - diff --git a/log.md b/log.md index 50c2f64..3ea4ad4 100644 --- a/log.md +++ b/log.md @@ -1,8 +1,4 @@ - - - - -# flickr +# flickr ``` Load text embed from: /root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt @@ -18,10 +14,9 @@ Total number of boxes: 638214 # Object365 - ``` -(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_object365.py +(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_object365.py set workspace: /root/ultra_louis_work/ultralytics set workspace: /root/ultra_louis_work/ultralytics 608606 @@ -36,8 +31,7 @@ Total number of boxes: 15518179 # mixed - -(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_mixed.py +(clipenv) root@autodl-container-45e441a24a-850d01e8:~/ultra_louis_work/data_engine# python data_visual_mixed.py set workspace: /root/ultra_louis_work/ultralytics set workspace: /root/ultra_louis_work/ultralytics 46380 @@ -48,4 +42,4 @@ Total number of boxes: 2245337 46380 Load text embed from: /root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt Data style: grounding -Total number of labels: 46380 \ No newline at end of file +Total number of labels: 46380
diff --git a/readme.md b/readme.md index 0802e8c..b1cd38e 100644 --- a/readme.md +++ b/readme.md @@ -1,33 +1,27 @@ +# pipeline of the data engine +### read the grounding data from the JSON file - -# pipline of the data engine -### read the grounding data from the JSON file for each sample, pre-store the other samples sharing the same image. - add the - -### model predict and save the JSON files -- visual to check the json files -- found that some boxes are overlapped heavily, with different text -- how to deal with these boxes? - -### merge model prediction to label, -- discard the bbox with higher iou ( > 0.8, higher iou, no consider the class or text) + add the +### run model prediction and save the JSON files +- visualize to check the JSON files +- found that some boxes overlap heavily, with different text +- how to deal with these boxes? +### merge model prediction to label, +- discard the bbox with IoU > 0.8 against an existing label box (class and text are not considered; see the sketch below) -- generate the visual prompt embedding for each instance (bbox) - - -- merge bboxes within the same image ( consider the vpe distance and text similarity,bbox iou<0.8 ) - +- generate the visual prompt embedding for each instance (bbox) -- transfer to grounding format cache for training +- merge bboxes within the same image (consider the VPE distance and text similarity, bbox IoU < 0.8) -- +- transfer to grounding format cache for training +- to do:
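The IoU-discard rule above reduces to a short, class-agnostic filter. A minimal sketch (illustrative only: `iou_matrix` and `merge_predictions` are hypothetical helper names, boxes are assumed to be float xyxy arrays; only the 0.8 threshold and the class/text-agnostic comparison come from the readme):

```python
import numpy as np


def iou_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Pairwise IoU between two sets of xyxy boxes, shapes (N, 4) and (M, 4)."""
    tl = np.maximum(a[:, None, :2], b[None, :, :2])  # intersection top-left corners
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])  # intersection bottom-right corners
    inter = np.prod(np.clip(br - tl, 0.0, None), axis=2)  # zero area when boxes do not overlap
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / np.maximum(union, 1e-9)  # guard against division by zero for degenerate boxes


def merge_predictions(label_boxes: np.ndarray, pred_boxes: np.ndarray, thr: float = 0.8) -> np.ndarray:
    """Append predicted boxes unless they overlap an existing label box with IoU > thr."""
    if len(label_boxes) == 0 or len(pred_boxes) == 0:
        return np.concatenate([label_boxes, pred_boxes], axis=0)
    keep = iou_matrix(pred_boxes, label_boxes).max(axis=1) <= thr  # class/text deliberately ignored
    return np.concatenate([label_boxes, pred_boxes[keep]], axis=0)
```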
""" - - - yoloe_model= self.model - predictor=yolo.yoloe.YOLOEVPDetectPredictor + yoloe_model = self.model + predictor = yolo.yoloe.YOLOEVPDetectPredictor if type(yoloe_model.predictor) is not predictor: yoloe_model.predictor = predictor( overrides={ @@ -46,12 +50,10 @@ def vpe_text(self, source, visual_prompts ,texts): _callbacks=yoloe_model.callbacks, ) # self.task = "segment" if isinstance(self.predictor, yolo.segment.SegmentationPredictor) else "detect" - - # get the vpe from current image and visual prompts - prompts={"bboxes": visual_prompts["bboxes"], - "cls":list( range( len(visual_prompts["cls"])))} - num_cls= len(set(prompts["cls"])) + # get the vpe from current image and visual prompts + prompts = {"bboxes": visual_prompts["bboxes"], "cls": list(range(len(visual_prompts["cls"])))} + num_cls = len(set(prompts["cls"])) yoloe_model.model.model[-1].nc = num_cls yoloe_model.model.model[-1].no = num_cls + yoloe_model.model.model[-1].reg_max * 4 yoloe_model.model.names = [f"object{i}" for i in range(num_cls)] @@ -59,24 +61,19 @@ def vpe_text(self, source, visual_prompts ,texts): yoloe_model.predictor.setup_model(model=yoloe_model.model) vpe = yoloe_model.predictor.get_vpe(source).squeeze(0) - tpe= yoloe_model.get_text_pe(texts).squeeze(0) + tpe = yoloe_model.get_text_pe(texts).squeeze(0) # normalize - vpe= torch.nn.functional.normalize(vpe,dim=-1,p=2) - tpe= torch.nn.functional.normalize(tpe,dim=-1,p=2) + vpe = torch.nn.functional.normalize(vpe, dim=-1, p=2) + tpe = torch.nn.functional.normalize(tpe, dim=-1, p=2) # cal the similarity and return the text for each box similarities = (vpe @ tpe.T).softmax(dim=-1) # (N, M) matched_indices = similarities.argmax(dim=-1) # (N,) matched_texts = [texts[i] for i in matched_indices.tolist()] return matched_texts - - - - def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: - """ - Load annotations from a JSON file, filter, and normalize bounding boxes for each image. + """Load annotations from a JSON file, filter, and normalize bounding boxes for each image. Args: path (Path): Path where to save the cache file. 
@@ -89,7 +86,6 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: with open(self.json_file) as f: annotations = json.load(f) - # images = {f"{im['id']:d}": im for im in annotations["images"]} # Map image IDs to file names @@ -101,9 +97,6 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: imid = ann["image_id"] imname = imid_imname[f"{imid:d}"] imname_anns[imname].append(ann) - - - # # map sample id to the annotations # img_ids= [im["id"] for im in annotations["images"]] @@ -116,13 +109,10 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: for ann in annotations["annotations"]: imid_anns[ann["image_id"]].append(ann) - - if not hasattr(self, 'model') or self.model is None: + if not hasattr(self, "model") or self.model is None: self.load_yoloe() - for img_id, anns in TQDM(imid_anns.items(), desc=f"Reading annotations {self.json_file}"): - # if img_id > 16*10: break # for testing img = images[f"{img_id:d}"] h, w, f = img["height"], img["width"], img["file_name"] @@ -130,18 +120,19 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: if not im_file.exists(): continue self.im_files.append(str(im_file)) - bboxes_xyxy=[] + bboxes_xyxy = [] bboxes = [] segments = [] cat2id = {} texts = [] - - anns_for_img=imname_anns[f] + anns_for_img = imname_anns[f] for ann in anns + anns_for_img: - - if len(bboxes_xyxy) > 0 and YoloBox([int(h),int(w)]).load_from_xyxy(bboxes_xyxy).iou(ann["bbox"]).max()>0.98: + if ( + len(bboxes_xyxy) > 0 + and YoloBox([int(h), int(w)]).load_from_xyxy(bboxes_xyxy).iou(ann["bbox"]).max() > 0.98 + ): # print("skip duplicate box") continue if ann["iscrowd"]: @@ -162,7 +153,7 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: cat2id[cat_name] = len(cat2id) texts.append([cat_name]) cls = cat2id[cat_name] # class - box = [cls] + box.tolist() + box = [cls, *box.tolist()] if box not in bboxes: bboxes.append(box) if ann.get("segmentation") is not None: @@ -179,10 +170,9 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: .reshape(-1) .tolist() ) - s = [cls] + s + s = [cls, *s] segments.append(s) - bboxes_xyxy.append(ann["bbox"]) # add xyxy box for iou calculation - + bboxes_xyxy.append(ann["bbox"]) # add xyxy box for iou calculation lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32) @@ -192,32 +182,29 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh) lb = np.array(lb, dtype=np.float32) - label= { - "im_file": im_file, - "shape": (h, w), - "cls": lb[:, 0:1], # n, 1 - "bboxes": lb[:, 1:], # n, 4 - "segments": segments, - "normalized": True, - "bbox_format": "xywh", - "texts": texts, - } + label = { + "im_file": im_file, + "shape": (h, w), + "cls": lb[:, 0:1], # n, 1 + "bboxes": lb[:, 1:], # n, 4 + "segments": segments, + "normalized": True, + "bbox_format": "xywh", + "texts": texts, + } # - x["labels"].append(label) + ####### append boxes - - ####### append boxes - - batch_size=64 + batch_size = 64 - self.data_style="grounding" - for start in tqdm(range(0,len(x["labels"]),batch_size)): - batch_indices=list(range(start,min(start+batch_size,len(x["labels"])))) - batch_texts=[] + self.data_style = "grounding" + for start in tqdm(range(0, len(x["labels"]), batch_size)): + batch_indices = list(range(start, min(start + batch_size, len(x["labels"])))) + 
batch_texts = [] for indice in batch_indices: label_texts = x["labels"][indice].get("texts", []) if isinstance(label_texts, list): @@ -233,66 +220,58 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: else: self.set_classes(name_list=None) - results=self.yoloe_predict_batch([ x["labels"][i] for i in batch_indices ], conf=0.1,iou=0.4) - assert len(results)==len(batch_indices), "Mismatch between results and batch_indices length" - for indice,res in zip(batch_indices,results): - iou=0.1 # append new boxes when iou < 0.1 - replace=False # do not replace existing boxes - x["labels"][indice]= self._update_grounding_label(x["labels"][indice],res,iou=iou,replace=replace) - - + results = self.yoloe_predict_batch([x["labels"][i] for i in batch_indices], conf=0.1, iou=0.4) + assert len(results) == len(batch_indices), "Mismatch between results and batch_indices length" + for indice, res in zip(batch_indices, results): + iou = 0.1 # append new boxes when iou < 0.1 + replace = False # do not replace existing boxes + x["labels"][indice] = self._update_grounding_label(x["labels"][indice], res, iou=iou, replace=replace) self.load_yoloe() # reload to reset class number ##### refine the bbox texts imname_image = {im["file_name"]: im for im in annotations["images"]} - for indice,label in tqdm(enumerate(x["labels"]), desc="Refining texts for grounding data"): - bboxes_xyxy= YoloBox((int(label["shape"][0]),int(label["shape"][1]))).load_from_xywhn_normalized(label["bboxes"]).xyxy - visual={"bboxes": bboxes_xyxy, - "cls": list(range(bboxes_xyxy.shape[0]))} - texts= [] + for indice, label in tqdm(enumerate(x["labels"]), desc="Refining texts for grounding data"): + bboxes_xyxy = ( + YoloBox((int(label["shape"][0]), int(label["shape"][1]))) + .load_from_xywhn_normalized(label["bboxes"]) + .xyxy + ) + visual = {"bboxes": bboxes_xyxy, "cls": list(range(bboxes_xyxy.shape[0]))} + texts = [] for text_list in label["texts"]: texts.extend(text_list) - print("original texts for image ", ":", texts) - caption= imname_image[label["im_file"].name]["caption"].replace(".","") - caption_texts= caption.split() + print("original texts for image ", ":", texts) + caption = imname_image[label["im_file"].name]["caption"].replace(".", "") + caption_texts = caption.split() texts.extend(caption_texts) - print("caption_texts for image ", ":", caption_texts) - texts= list(set(texts)) - matched_texts= self.vpe_text(source= label["im_file"], visual_prompts= visual, texts= texts) - matches_texts_set= list(set(matched_texts)) - label['texts']= [[text] for text in matches_texts_set] + print("caption_texts for image ", ":", caption_texts) + texts = list(set(texts)) + matched_texts = self.vpe_text(source=label["im_file"], visual_prompts=visual, texts=texts) + matches_texts_set = list(set(matched_texts)) + label["texts"] = [[text] for text in matches_texts_set] # take cls as the index in the matched texts set - label["cls"]= [ matches_texts_set.index(text) for text in matched_texts ] - + label["cls"] = [matches_texts_set.index(text) for text in matched_texts] - - print(label['cls']) + print(label["cls"]) print(matched_texts) - print(label['texts']) - x["labels"][indice]= label - - + print(label["texts"]) + x["labels"][indice] = label x["hash"] = get_hash(self.json_file) - save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION) return x - - - - # """ # refine the text for grounding data by running grounding prediction and updating the texts. # how to set classes: # 1. 
collect all texts in the current batch. -# 2. do such refinement for each image in the batch: +# 2. do such refinement for each image in the batch: -# # read from the json file as +# # read from the json file as # ["two people"] -> ["two", "people","two people"] # ["what"]. -> update according to the grounding prediction results. @@ -301,12 +280,10 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # """ - - # def load_src_json(self, json_path): # """ -# read the original json file for grounding dataset. +# read the original json file for grounding dataset. # """ # import json @@ -333,18 +310,18 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # print("number of annotations:", len(data["annotations"])) # print("-"*40) # print("example image entry:", data["images"][0]) -# print("exmple category entry:") +# print("example category entry:") # for i in range(4): # print("-"*40) -# print( data["annotations"][i]) +# print( data["annotations"][i]) -# return +# return # json_data = { # "file_names": [], # "images": {}, -# "annotations": {} +# "annotations": {} # } # for x in data["images"]: # file_name = x["file_name"] @@ -355,7 +332,7 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # print # self.json_data = json_data - + # def get_captions_texts(self,file_name): # """ # get all captions and split them into texts @@ -373,15 +350,14 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # return captions, caption_texts - # de=DataEngine() # load_src_json(de, "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.json") # cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.merged.cache" # text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" -# # de.load_cached_label(cache_path=cache_path, -# # data_style="grounding", +# # de.load_cached_label(cache_path=cache_path, +# # data_style="grounding", # # text_embed_pt=text_embed_pt) # # de.load_yoloe() @@ -392,32 +368,22 @@ def cache_labels(self, path: Path = Path("./labels.cache")) -> dict[str, Any]: # # def predict_and_update_text(self,indice): +DATA_DIR = "../datasets/" -from ultralytics import YOLOE -from ultralytics.models.yolo.yoloe import YOLOEVPTrainer - - - - - -DATA_DIR="../datasets/" - -Objects365v1="../datasets/Objects365v1.yaml" +Objects365v1 = "../datasets/Objects365v1.yaml" import argparse + parser = argparse.ArgumentParser() -parser.add_argument('--img_path', type=str, default=DATA_DIR+"flickr/full_images/") -parser.add_argument('--json_file', type=str, default=DATA_DIR+"flickr/annotations/final_flickr_separateGT_train_segm.json") +parser.add_argument("--img_path", type=str, default=DATA_DIR + "flickr/full_images/") +parser.add_argument( + "--json_file", type=str, default=DATA_DIR + "flickr/annotations/final_flickr_separateGT_train_segm.json" +) args = parser.parse_args() - -data= RefineGroundingDataset( - img_path=args.img_path, - json_file=args.json_file, - ) - - - - +data = RefineGroundingDataset( + img_path=args.img_path, + json_file=args.json_file, +) diff --git a/remove_segment.py b/remove_segment.py index 1f554d5..a6a3ac7 100644 --- a/remove_segment.py +++ b/remove_segment.py @@ -1,28 +1,26 @@ from data_engine import DataEngine -if __name__=="__main__": - - de=DataEngine(device="cuda") - yaml_config="/root/ultra_louis_work/datasets/Objects365v1.yaml" - 
cache_path="/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" - de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) + +if __name__ == "__main__": + de = DataEngine(device="cuda") + yaml_config = "/root/ultra_louis_work/datasets/Objects365v1.yaml" + cache_path = "/root/ultra_louis_work/datasets/Objects365v1/labels/train.updated.cache" + de.load_cached_label(cache_path=cache_path, data_style="detection", yaml_config=yaml_config) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) - - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = ( + "/root/ultra_louis_work/datasets/mixed_grounding/annotations/final_mixed_train_no_coco_segm.updated.cache" + ) + text_embed_pt = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) - de=DataEngine() - cache_path="/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" - text_embed_pt="/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" - de.load_cached_label(cache_path=cache_path, - data_style="grounding", - text_embed_pt=text_embed_pt) + de = DataEngine() + cache_path = "/root/ultra_louis_work/datasets/flickr/annotations/final_flickr_separateGT_train_segm.updated.cache" + text_embed_pt = "/root/ultra_louis_work/datasets/flickr/text_embeddings_mobileclip_blt.pt" + de.load_cached_label(cache_path=cache_path, data_style="grounding", text_embed_pt=text_embed_pt) de.remove_masks_and_segments() de.save_cached_label(save_path=cache_path) diff --git a/utils.py b/utils.py index 93f2543..9e4c71a 100644 --- a/utils.py +++ b/utils.py @@ -1,11 +1,8 @@ -import os from pathlib import Path def get_img_num(folder): - """ - calculate the number of images in a folder - """ + """Calculate the number of images in a folder.""" img_suffix = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"] img_num = 0 for suffix in img_suffix: @@ -13,33 +10,25 @@ def get_img_num(folder): return img_num - def get_json_num(folder): - """ - calculate the number of json files in a folder - """ + """Calculate the number of json files in a folder.""" json_num = len(list(Path(folder).rglob("*.json"))) return json_num - -flickr_res_json_dir="/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict" +flickr_res_json_dir = "/root/ultra_louis_work/runs/flickr_engine_buffer/model_predict" print("number of json files:", get_json_num(flickr_res_json_dir)) -flickr_img_dir="/root/ultra_louis_work/datasets/flickr/full_images" +flickr_img_dir = "/root/ultra_louis_work/datasets/flickr/full_images" print("number of flickr images:", get_img_num(flickr_img_dir)) - -mixed_img_dir="/root/ultra_louis_work/datasets/mixed_grounding/gqa/images" +mixed_img_dir = "/root/ultra_louis_work/datasets/mixed_grounding/gqa/images" print("number of images:", get_img_num(mixed_img_dir)) - -mixed_res_json_dir="/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict" +mixed_res_json_dir = 
"/root/ultra_louis_work/runs/mixed_engine_buffer/model_predict" print("number of json files:", get_json_num(mixed_res_json_dir)) - - diff --git a/visual_json.py b/visual_json.py index 5d86e8e..790440c 100644 --- a/visual_json.py +++ b/visual_json.py @@ -1,15 +1,16 @@ +from __future__ import annotations + import json import os from pathlib import Path -from typing import Any, Dict, List +from typing import Any import numpy as np import torch -from PIL import Image import ultralytics +from PIL import Image from ultralytics.engine.results import Results - workspace = os.path.dirname(os.path.dirname(os.path.abspath(ultralytics.__file__))) os.chdir(workspace) print("set workspace:", workspace) @@ -18,9 +19,8 @@ from data_engine_agent import Instance, Sample # noqa: E402 pylint: disable=C0413 -def _ensure_sequence(value: Any) -> List[Any]: +def _ensure_sequence(value: Any) -> list[Any]: """Normalize single values to a list while leaving iterables intact.""" - if value is None: return [] if isinstance(value, (list, tuple)): @@ -30,13 +30,12 @@ def _ensure_sequence(value: Any) -> List[Any]: def load_from_json(json_path: Path | str) -> Sample: """Load a `Sample` from a JSON file generated by the data engine.""" - json_path = Path(json_path) if not json_path.is_file(): raise FileNotFoundError(f"Sample JSON not found: {json_path}") with json_path.open("r", encoding="utf-8") as handle: - payload: Dict[str, Any] = json.load(handle) + payload: dict[str, Any] = json.load(handle) sample = Sample() sample.im_file = payload.get("im_file") @@ -78,7 +77,6 @@ def load_from_json(json_path: Path | str) -> Sample: def _resolve_image_path(sample: Sample, image_root: Path | str | None = None) -> Path: """Resolve the image path for a sample, considering optional root hints.""" - if sample.im_file is None: raise ValueError("Sample does not specify an image file") @@ -103,25 +101,23 @@ def _resolve_image_path(sample: Sample, image_root: Path | str | None = None) -> raise FileNotFoundError(f"Unable to locate image file for sample: {sample.im_file}") -def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> List[Results]: +def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> list[Results]: """Convert a `Sample` into a list containing a single Ultralytics `Results` object.""" - img_path = _resolve_image_path(sample, image_root=image_root) orig_img = np.array(Image.open(img_path).convert("RGB")) - text_instances={} + text_instances = {} for inst in sample.instances: if inst.text[0] not in text_instances.keys(): - text_instances[inst.text[0]]=[] + text_instances[inst.text[0]] = [] text_instances[inst.text[0]].append(inst) text_result = {} for text, instances in text_instances.items(): - - boxes_data: List[List[float]] = [] - names: List[str] = [] - name_to_idx: Dict[str, int] = {} + boxes_data: list[list[float]] = [] + names: list[str] = [] + name_to_idx: dict[str, int] = {} for inst in instances: bbox_array = np.array(inst.bbox, dtype=np.float32).reshape(-1, 4) @@ -137,16 +133,22 @@ def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> L conf_value = float(inst.conf[0]) if inst.conf else 0.0 for bbox in bbox_array: - boxes_data.append([ - float(bbox[0]), - float(bbox[1]), - float(bbox[2]), - float(bbox[3]), - conf_value, - cls_idx, - ]) - - boxes_tensor = torch.from_numpy(np.array(boxes_data, dtype=np.float32)) if boxes_data else torch.zeros((0, 6), dtype=torch.float32) + boxes_data.append( + [ + float(bbox[0]), + float(bbox[1]), + float(bbox[2]), + 
float(bbox[3]), + conf_value, + cls_idx, + ] + ) + + boxes_tensor = ( + torch.from_numpy(np.array(boxes_data, dtype=np.float32)) + if boxes_data + else torch.zeros((0, 6), dtype=torch.float32) + ) names_dict = {idx: name for idx, name in enumerate(names)} result = Results( @@ -155,12 +157,12 @@ def sample_to_results(sample: Sample, image_root: Path | str | None = None) -> L names=names_dict, boxes=boxes_tensor, ) - text_result[text]=result + text_result[text] = result return text_result + def visualize_sample(sample: Sample, dst_vis_img: Path | str, image_root: Path | str | None = None) -> Path: """Render sample predictions to an image and save it to ``dst_vis_img``.""" - text_result = sample_to_results(sample, image_root=image_root) dst_vis_path = Path(dst_vis_img) dst_vis_path.parent.mkdir(parents=True, exist_ok=True) @@ -183,4 +185,4 @@ def visualize_sample(sample: Sample, dst_vis_img: Path | str, image_root: Path | print(f"Loaded {len(sample.instances)} instances from {sample.im_file}") output_path = Path("../visual_json/visual_img.jpg") saved_path = visualize_sample(sample, output_path) - print(f"Saved visualization to {saved_path}") \ No newline at end of file + print(f"Saved visualization to {saved_path}")
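If a consumer only needs the grouped boxes rather than a rendered image, `sample_to_results` can be called directly. A minimal sketch (the JSON path is a placeholder; `load_from_json` and `sample_to_results` are the helpers defined in visual_json.py above):

```python
from visual_json import load_from_json, sample_to_results

sample = load_from_json("path/to/sample.json")  # placeholder path to an engine-generated JSON
results_by_text = sample_to_results(sample)  # dict: text label -> Ultralytics Results
for text, res in results_by_text.items():
    print(text, res.boxes.xyxy.shape)  # xyxy boxes grouped under this text label
```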