From 4bf83aa8b8ff26943f7dcb574232db940b35fa86 Mon Sep 17 00:00:00 2001
From: xcy
Date: Thu, 31 Jul 2025 14:59:39 +0800
Subject: [PATCH 01/16] Implement op-parallel data processing based on Ray
 Actor

---
 data_juicer/core/data/ray_dataset.py          | 325 +++++++++++++++++-
 data_juicer/core/ray_actor.py                 | 123 +++++++
 data_juicer/ops/base_op.py                    |  21 +-
 .../ops/filter/video_aesthetics_filter.py     |   5 +-
 .../video_captioning_from_frames_mapper.py    |   8 +-
 .../process_video_on_ray/configs/pr_demo.yaml |  46 +++
 6 files changed, 517 insertions(+), 11 deletions(-)
 create mode 100644 data_juicer/core/ray_actor.py
 create mode 100644 demos/process_video_on_ray/configs/pr_demo.yaml

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index c80b3bfbf0..beb218fdfe 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -2,8 +2,13 @@
 import os
 from functools import partial
+import queue
+import threading
+import time
 from typing import Any, Dict, List, Literal, Optional, Union
+import uuid

+from data_juicer.core.ray_actor import Actor
 import pyarrow
 from jsonargparse import Namespace
 from loguru import logger
@@ -18,9 +23,15 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.process_utils import calculate_np
 from data_juicer.utils.webdataset_utils import _custom_default_decoder
-
+import ray
 ray = LazyLoader("ray")
-
+from ray.util.placement_group import (
+    placement_group,
+    placement_group_table,
+    remove_placement_group,
+)
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from ray.data import from_items

 def get_abs_path(path, dataset_dir):
     if is_remote_path(path):
@@ -92,6 +103,8 @@ class RayDataset(DJDataset):
     def __init__(self, dataset: ray.data.Dataset, dataset_path: str = None, cfg: Optional[Namespace] = None) -> None:
         self.data = preprocess_dataset(dataset, dataset_path, cfg)
         self.num_proc = getattr(cfg, "np", getattr(cfg, "num_proc", None)) if cfg else None
+        self.gpu_pg = placement_group([{"CPU": 16, "GPU": 2}], strategy="STRICT_SPREAD")
+        ray.get(self.gpu_pg.ready())

     def schema(self) -> Schema:
         """Get dataset schema.
@@ -142,7 +155,7 @@ def get_column(self, column: str, k: Optional[int] = None) -> List[Any]:

         return [row[column] for row in self.data.take()]

-    def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
+    def process2(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
         if operators is None:
             return self
         if not isinstance(operators, list):
@@ -150,7 +163,313 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -
         for op in operators:
             self._run_single_op(op)
         return self
+
+    def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
+        print("test start")
+        if operators is None:
+            return self
+        if not isinstance(operators, list):
+            operators = [operators]
+        add_meta = False
+        add_stats = False
+        for op in operators:
+            columns = self.data.columns()
+            if op._name in TAGGING_OPS.modules and Fields.meta not in self.data.columns():
+                add_meta = True
+            if isinstance(op, Filter):
+                if Fields.stats not in columns:
+                    add_stats = True
+        if add_meta:
+            def process_batch_arrow(table: pyarrow.Table):
+                new_column_data = [{} for _ in range(len(table))]
+                new_table = table.append_column(Fields.meta, [new_column_data])
+                return new_table
+
+            self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
+        if add_stats:
+            def process_batch_arrow(table: pyarrow.Table):
+                new_column_data = [{} for _ in range(len(table))]
+                new_table = table.append_column(Fields.stats, [new_column_data])
+                return new_table
+            self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
+
+        actors = {}
+        # resource allocation
+        gpu_allocate = [0,1,1]
+        actor_allocate = [0,1,1]
+        cpu_allocate = 1
+        bundle_allocate = [0, 0, 0]
+        for idx, op in enumerate(operators):
+            op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
+            actors[op._name] = []
+
+            if actor_allocate[idx] > 0:
+                actor_num = actor_allocate[idx]
+            else:
+                actor_num = min(op_proc, self.data.count())
+            if op.use_cuda():
+                num_gpus = gpu_allocate[idx]
+                print(f"{op._name} allocate {num_gpus} GPUs.")
+
+                for _ in range(actor_num):  # launch multiple actors
+                    actor = Actor.options(
+                        name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
+                        num_gpus=num_gpus,
+                        num_cpus=cpu_allocate,
+                        scheduling_strategy=PlacementGroupSchedulingStrategy(
+                            placement_group=self.gpu_pg,
+                            placement_group_capture_child_tasks=True
+                        ),
+                        placement_group_bundle_index=bundle_allocate[idx],
+                    ).remote(op)
+
+                    actor.load_model.remote()
+                    actors[op._name].append(actor)
+            else:
+                num_gpus = 0
+                print(f"{op._name} allocate in CPU.")
+                for _ in range(actor_num):  # launch multiple actors
+                    actor = Actor.options(
+                        name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
+                        num_gpus=num_gpus,
+                        num_cpus=cpu_allocate,
+                        scheduling_strategy=PlacementGroupSchedulingStrategy(
+                            placement_group=self.gpu_pg,
+                            placement_group_capture_child_tasks=True
+                        ),
+                        placement_group_bundle_index=bundle_allocate[idx],
+                    ).remote(op)
+                    actors[op._name].append(actor)
+
+        # print info for all actors
+        for op_name, actor_list in actors.items():
+            logger.info(f"Operator {op_name} has the following actors:")
+            for i, actor in enumerate(actor_list):
+                logger.info(f"  Actor {i}: {actor._ray_actor_id.hex()[:6]}")
+
+        input_data = self.data.take_all()
+        op_buckets = {op._name: queue.Queue() for op in operators}
+
+        # put the initial data into the first operator's queue
+        for data_item in input_data:
+            op_buckets[operators[0]._name].put(data_item)
+
+        # add end markers, one per actor of the first operator
+        for _ in range(len(actors[operators[0]._name])):
+            op_buckets[operators[0]._name].put(None)
+
+        # store the results of each operator
+        final_results = []
+        # create an event for each operator
+        events = {op._name: threading.Event() for op in operators}
+        # mark the first operator's event as set
+        events[operators[0]._name].set()
+
+        # track the completion status of each operator
+        op_completion_count = {op._name: 0 for op in operators}
+        # synchronize the multiple actors of the same operator
+        op_actor_locks = {op._name: threading.Lock() for op in operators}
+
+        def process_operator(op_idx, op, actor):
+            op_name = op._name
+            input_queue = op_buckets[op_name]
+
+            # determine the output queue
+            if op_idx + 1 < len(operators):
+                output_queue = op_buckets[operators[op_idx + 1]._name]
+            else:
+                output_queue = None  # the last operator
+
+            logger.info(f"Starting processor for {op_name} actor {actor._ray_actor_id.hex()[:6]}")
+
+            start_time = time.time()
+
+            while True:
+                try:
+                    # wait for the previous operator to fire its event before processing
+                    events[op_name].wait()
+
+                    # fetch data from the input queue with a timeout to avoid waiting forever
+                    data_item = input_queue.get(timeout=10.0)
+
+                    # check for the end marker
+                    if data_item is None:
+                        with op_actor_locks[op_name]:
+                            op_completion_count[op_name] += 1
+                            # only propagate to the next queue after all actors have received the end marker
+                            if op_completion_count[op_name] == len(actors[op_name]) and output_queue:
+                                # pass as many end markers to the next operator as it has actors
+                                next_op_name = operators[op_idx + 1]._name if op_idx + 1 < len(operators) else None
+                                if next_op_name:
+                                    for _ in range(len(actors[next_op_name])):
+                                        output_queue.put(None)
+                        break
+
+                    # process the data
+                    future = None
+                    if isinstance(op, Mapper):
+                        if op.use_cuda():
+                            if op.is_batched_op():
+                                future = actor.mapper_cuda_batched.remote(self.transform_to_2d_format(data_item))
+                            else:
+                                future = actor.mapper_cuda.remote(data_item)
+                        else:
+                            future = actor.mapper_cpu.remote(data_item)
+
+                        result = ray.get(future)
+                        # print("res:", result)
+                        # send the result to the next queue
+                        if output_queue:
+                            output_queue.put(result)
+                        else:
+                            final_results.append(result)
+
+                    elif isinstance(op, Filter):
+                        if op.use_cuda():
+                            if op.is_batched_op():
+                                future = actor.filter_cuda_batched.remote(data_item)
+                            else:
+                                future = actor.filter_cuda_single.remote(data_item)
+                        else:
+                            if op.is_batched_op():
+                                future = actor.filter_cpu_batched.remote(data_item)
+                            else:
+                                future = actor.filter_cpu_single.remote(data_item)
+
+                        results = ray.get(future)
+
+                        if results:
+                            if isinstance(results, list):
+                                for result in results:
+                                    if output_queue:
+                                        output_queue.put(result)
+                                    else:
+                                        final_results.append(result)
+                            else:
+                                if output_queue:
+                                    output_queue.put(results)
+                                else:
+                                    final_results.append(results)
+
+                        if op.stats_export_path is not None:
+                            actor.export_stats(results, op.stats_export_path)
+
+                    # mark the task as done
+                    input_queue.task_done()
+
+                    # after processing, set the current operator's event so the next operator can start
+                    if op_idx + 1 < len(operators):
+                        events[operators[op_idx + 1]._name].set()
+
+                except queue.Empty:
+                    logger.info(f"{op_name} actor {actor._ray_actor_id.hex()[:6]} queue timeout, checking if pipeline is complete")
+                    continue
+                except Exception as e:
+                    logger.error(f"Error in {op_name} actor {actor._ray_actor_id.hex()[:6]}: {e}")
+                    input_queue.task_done()
+                    break
+
+            end_time = time.time()
+            logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f} seconds")
+
+        # start a processing thread for each actor of each operator
+        threads = []
+        for idx, op in enumerate(operators):
+            for actor in actors[op._name]:
+                thread = threading.Thread(
+                    target=process_operator,
+                    args=(idx, op, actor),
+                    name=f"processor_{op._name}_{actor._ray_actor_id.hex()[:6]}"
+                )
+                thread.daemon = True
+                thread.start()
+                threads.append(thread)
+
+        # wait for all threads to finish
+        for thread in threads:
+            thread.join()
+        print("\nfinal res:", final_results)
+        # merge the final results
+        # flattened_data = list(chain.from_iterable(final_results))
+        # print("\nfinal res:", flattened_data)
+        # self.data = from_items(flattened_data)
+        self.data = from_items(final_results)
+        return self.data
+
+    def transform_to_2d_format(self, data):
+        """
+        Convert flat row-wise data into the nested format,
+        grouping all fields by the unique values of __dj__source_file__.
+        """
+        print("data before trans", data)
+        if '__dj__source_file__' not in data:
+            raise ValueError("Data must contain the '__dj__source_file__' field")
+
+        source_files = data['__dj__source_file__']
+
+        # get the unique source files, preserving order
+        unique_sources = []
+        seen = set()
+        for source in source_files:
+            if source not in seen:
+                unique_sources.append(source)
+                seen.add(source)
+
+        # build an index mapping for each unique source file
+        source_to_indices = {}
+        for source in unique_sources:
+            source_to_indices[source] = [i for i, s in enumerate(source_files) if s == source]
+
+        # initialize the transformed data structure
+        transformed_data = {}
+
+        # iterate over all fields of the original data
+        for field_name, field_value in data.items():
+            if field_name == '__dj__source_file__':
+                # special handling for the __dj__source_file__ field
+                transformed_data[field_name] = []
+                for source in unique_sources:
+                    indices = source_to_indices[source]
+                    transformed_data[field_name].append([source] * len(indices))
+            elif isinstance(field_value, list):
+                # handle list-valued fields
+                transformed_data[field_name] = []
+                for source in unique_sources:
+                    indices = source_to_indices[source]
+                    group_data = [field_value[i] for i in indices]
+                    transformed_data[field_name].append(group_data)
+            elif isinstance(field_value, dict):
+                # handle dict-valued fields
+                transformed_data[field_name] = []
+                for source in unique_sources:
+                    indices = source_to_indices[source]
+                    group_dict = {}
+                    for key, values in field_value.items():
+                        if isinstance(values, list):
+                            group_dict[key] = [values[i] for i in indices]
+                        else:
+                            # if the value is not a list, repeat it
+                            group_dict[key] = [values] * len(indices)
+                    transformed_data[field_name].append(group_dict)
+            elif isinstance(field_value, str):
+                # handle string-valued fields
+                transformed_data[field_name] = []
+                for source in unique_sources:
+                    indices = source_to_indices[source]
+                    # for strings, repeat the string for each group
+                    transformed_data[field_name].append(field_value)
+            else:
+                # handle fields of other types
+                transformed_data[field_name] = []
+                for source in unique_sources:
+                    indices = source_to_indices[source]
+                    # repeat the value for each group
+                    transformed_data[field_name].append(field_value)
+        print("data after trans", transformed_data)
+        return transformed_data
+
+
     def _run_single_op(self, op):
         op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
         num_gpus = get_num_gpus(op, op_proc)
diff --git a/data_juicer/core/ray_actor.py b/data_juicer/core/ray_actor.py
new file mode 100644
index 0000000000..bacc57b30d
--- /dev/null
+++ b/data_juicer/core/ray_actor.py
@@ -0,0 +1,123 @@
+from functools import partial
+import ray
+import pyarrow
+
+from data_juicer.ops.base_op import Filter, Mapper
+from loguru import logger
+
+
+
+def filter_batch(batch, filter_func):
+    mask = pyarrow.array(filter_func(batch.to_pydict()))
+    return batch.filter(mask)
+
+@ray.remote(num_gpus=0.0)
+class Actor:
+    def __init__(self, op, rank=None):
+
+        self.op = op
+        self._model_loaded = False  # whether the model has been loaded
+        self.rank = rank
+        self.model = None
+        self.processor = None
+
+    def load_model(self):
+
+        if self.op.use_cuda() and not self._model_loaded:
+
+            self.model, self.processor = self.op.load_model(rank=self.rank)
+            self._model_loaded = True
+
+    def mapper_cuda(self, data):
+        if not self._model_loaded:
+            self.load_model()  # make sure the model is loaded before use
+        data = self.op.process_single(data, self.model, self.processor)
+        return data
+
+    def mapper_cuda_batched(self, data):
+        if not self._model_loaded:
+            self.load_model()  # make sure the model is loaded before use
+        data = self.op.process_batched(data, self.model, self.processor)
+        return data
+
+    def mapper_cpu(self, data):
+        # process the data
+        processed_data = self.op.process_single(data)
+        return processed_data
+
+    def filter_cuda_single(self, data):
+        if not self._model_loaded:
+            self.load_model()
+        data = self.op.compute_stats_single(data, self.model, self.processor)
+        keep = self.op.process_single(data)
+        if keep:
+            return data
+        else:
+            return None
+
+    def filter_cuda_batched(self, data):
+        if not self._model_loaded:
+            self.load_model()
+        # data = self.op.compute_stats_batched(data, self.model, self.processor)
+        data = self.op.compute_stats_batched(data)
+        keep_mask = list(self.op.process_batched(data))  # convert the map object to a list
+
+        # return None if no data should be kept
+        if not any(keep_mask):
+            return None
+
+        # filter the data by the mask
+        if isinstance(data, dict):
+            # if data is a dict (assuming each key maps to a list)
+            filtered_data = {
+                key: [value for value, keep in zip(values, keep_mask) if keep]
+                for key, values in data.items()
+            }
+        elif isinstance(data, list):
+            # if data is a list
+            filtered_data = [item for item, keep in zip(data, keep_mask) if keep]
+        else:
+            # other cases (e.g., Ray Dataset batches)
+            raise ValueError("Unsupported data type for batch filtering")
+
+        return filtered_data
+
+
+    def filter_cpu_single(self, data):
+        data = self.op.compute_stats_single(data)
+        keep = self.op.process_single(data)
+        if keep:
+            return data
+        else:
+            return None
+
+    def filter_cpu_batched(self, data):
+        # data = self.op.compute_stats_batched(data, self.model, self.processor)
+        data = self.op.compute_stats_batched(data)
+        keep_mask = list(self.op.process_batched(data))  # convert the map object to a list
+
+        # return None if no data should be kept
+        if not any(keep_mask):
+            return None
+
+        # filter the data by the mask
+        if isinstance(data, dict):
+            # if data is a dict (assuming each key maps to a list)
+            filtered_data = {
+                key: [value for value, keep in zip(values, keep_mask) if keep]
+                for key, values in data.items()
+            }
+        elif isinstance(data, list):
+            # if data is a list
+            filtered_data = [item for item, keep in zip(data, keep_mask) if keep]
+        else:
+            # other cases (e.g., Ray Dataset batches)
+            raise ValueError("Unsupported data type for batch filtering")
+
+        return filtered_data
+
+
+    def export_stats(self, data, export_path):
+
+        return data.write_json(export_path, force_ascii=False)
+
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
index 761d1e3794..c46e677870 100644
--- a/data_juicer/ops/base_op.py
+++ b/data_juicer/ops/base_op.py
@@ -1,5 +1,6 @@
 import copy
 from functools import wraps
+import time

 import numpy as np
 import pyarrow as pa
@@ -8,7 +9,7 @@
 from data_juicer import is_cuda_available
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.mm_utils import size_to_bytes
-from data_juicer.utils.model_utils import free_models
+from data_juicer.utils.model_utils import free_models, get_model
 from data_juicer.utils.process_utils import calculate_np
 from data_juicer.utils.registry import Registry

@@ -19,6 +20,10 @@
 ATTRIBUTION_FILTERS = Registry("Attribution Filters")


+import pytz
+from datetime import datetime
+beijing_tz = pytz.timezone('Asia/Shanghai')
+
 def convert_list_dict_to_dict_list(samples):
     # reconstruct samples from "list of dicts" to "dict of lists"
     keys = samples[0].keys()
@@ -285,7 +290,19 @@ def add_index(sample, idx):

     def empty_history(self):
         return np.empty((0, 0), dtype=str)
-
+    def load_model(self, rank=None):
+        start = time.time()
+        start_time = datetime.fromtimestamp(start, pytz.utc).astimezone(beijing_tz)
+        model, processor = 
get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) + end = time.time() + end_time = datetime.fromtimestamp(end, pytz.utc).astimezone(beijing_tz) + print( + f"[Actor] {self._name} Model loaded in {end - start:.3f} seconds " + f"from {start_time.strftime('%Y-%m-%d %H:%M:%S')} " + f"to {end_time.strftime('%Y-%m-%d %H:%M:%S')}" + ) + return model, processor + class Mapper(OP): def __init__(self, *args, **kwargs): """ diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index b779c2b6c7..c642c0a378 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -116,11 +116,10 @@ def __init__( "" if frame_sampling_method == "all_keyframes" else f"-{frame_num}" ) - def compute_stats_single(self, sample, rank=None, context=False): + def compute_stats_single(self, sample, model, processor, rank=None, context=False): # check if it's computed already if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: return sample - # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = np.array([], dtype=np.float64) @@ -154,7 +153,7 @@ def compute_stats_single(self, sample, rank=None, context=False): if len(frame_images) > 0: # compute aesthetics_scores - model, processor = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) + # model, processor = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) inputs = processor(images=frame_images, return_tensors="pt").to(model.device) with torch.no_grad(): outputs = model(**inputs) diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index 3df033a851..8abdc63700 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -160,7 +160,7 @@ def __init__( trust_remote_code=trust_remote_code ) - def _process_single_sample(self, ori_sample, rank=None, context=False): + def _process_single_sample(self, ori_sample, model, processor, rank=None, context=False): # there is no videos in this sample if self.video_key not in ori_sample or not ori_sample[self.video_key]: @@ -181,7 +181,7 @@ def _process_single_sample(self, ori_sample, rank=None, context=False): text = sample[self.text_key] offset = 0 - model, processor = get_model(self.model_key, rank, self.use_cuda()) + # model, processor = get_model(self.model_key, rank, self.use_cuda()) for chunk in text.split(SpecialTokens.eoc): @@ -331,7 +331,7 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk - def process_batched(self, samples, rank=None, context=False): + def process_batched(self, samples, model, processor, rank=None, context=False): """ :param samples: :return: @@ -356,6 +356,8 @@ def process_batched(self, samples, rank=None, context=False): if self.keep_original_sample: samples_after_generation.append(ori_sample) generated_samples = self._process_single_sample(ori_sample, + model, + processor, rank=rank, context=context) if len(generated_samples) != 0: diff --git a/demos/process_video_on_ray/configs/pr_demo.yaml b/demos/process_video_on_ray/configs/pr_demo.yaml new file mode 100644 index 0000000000..7000fc4847 --- /dev/null +++ b/demos/process_video_on_ray/configs/pr_demo.yaml @@ 
-0,0 +1,46 @@
+# Process config example for dataset
+
+# global parameters
+# baseline
+project_name: 'video-demo'
+executor_type: 'ray'
+ray_address: 'auto'                       # change to your ray cluster address, e.g., ray://<hostname>:<port>
+dataset:
+  configs:
+    - type: local
+      path: './demos/process_video_on_ray/data/demo-dataset.jsonl'  # path to your dataset directory or file
+export_path: './outputs/demo/process_video_on_ray/my-dataset'
+
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+  # Mapper ops
+  - video_split_by_scene_mapper:          # split videos into scene clips
+      detector: 'ContentDetector'         # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
+      threshold: 27.0                     # threshold passed to the detector
+      min_scene_len: 10                   # minimum length of any scene
+      show_progress: false                # whether to show progress from scenedetect
+  # # Filter ops
+  - video_aesthetics_filter:              # filter samples according to the aesthetics score of frame images extracted from videos.
+      hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE  # Huggingface model name for the aesthetics predictor
+      min_score: 0.3                      # the min aesthetics score of the filter range
+      max_score: 1.0                      # the max aesthetics score of the filter range
+      frame_sampling_method: 'uniform'    # sampling method for extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all key frames and the latter extracts a specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their differences are usually small in terms of aesthetics.
+      frame_num: 3                        # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      reduce_mode: avg                    # reduce mode over all frames extracted from videos, must be one of ['avg', 'max', 'min'].
+      any_or_all: any                     # keep this sample when any/all images meet the filter condition
+      mem_required: '1500MB'              # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
+  - video_captioning_from_frames_mapper:  # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated into a single string.
+      hf_img2seq: 'Salesforce/blip2-opt-2.7b'  # image-to-text model name on huggingface to generate captions
+      caption_num: 1                      # how many candidate captions to generate for each video
+      keep_candidate_mode: 'random_any'   # retain strategy for the generated $caption_num$ candidates. Should be one of ["random_any", "similar_one_simhash", "all"].
+      keep_original_sample: true          # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True by default.
+      prompt: null                        # a string prompt to guide the generation of the image-to-text model for all samples globally. It's None by default, which means no prompt is provided.
+      prompt_key: null                    # the key name of fields in samples to store prompts for each sample. It's used to set different prompts for different samples. If it's None, use the prompt in the "prompt" parameter. It's None by default.
+      frame_sampling_method: 'all_keyframes'  # sampling method for extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all key frames and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
+      frame_num: 3                        # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      horizontal_flip: false              # flip frame image horizontally (left to right).
+      vertical_flip: false                # flip frame image vertically (top to bottom).
+      mem_required: '20GB'                # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
+
\ No newline at end of file

From 54434dfb394deb756a1a840dc49d4b73a91b4bec Mon Sep 17 00:00:00 2001
From: xcy
Date: Tue, 5 Aug 2025 09:57:21 +0800
Subject: [PATCH 02/16] Combined: update ray_dataset.py + Ray Actor
 implementation

---
 data_juicer/core/data/ray_dataset.py | 81 ++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index beb218fdfe..dfa4579fd2 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -248,18 +248,81 @@ def process_batch_arrow(table: pyarrow.Table):
             for i, actor in enumerate(actor_list):
                 logger.info(f"  Actor {i}: {actor._ray_actor_id.hex()[:6]}")
 
-        input_data = self.data.take_all()
-        op_buckets = {op._name: queue.Queue() for op in operators}
+        # if there is only one operator, process it directly
+        if len(operators) == 1:
+            return self._process_single_operator(operators[0], actors[operators[0]._name])
 
-        # put the initial data into the first operator's queue
-        for data_item in input_data:
-            op_buckets[operators[0]._name].put(data_item)
+        # create operator queues (starting from the second operator)
+        op_buckets = {op._name: queue.Queue(maxsize=100) for op in operators[1:]}  # maxsize bounds memory usage
 
-        # add end markers, one per actor of the first operator
-        for _ in range(len(actors[operators[0]._name])):
-            op_buckets[operators[0]._name].put(None)
+        # store the final results
+        final_results = []
+        result_lock = threading.Lock()
+
+        # start processing threads for the downstream operators (consumers first)
+        threads = []
+        for idx, op in enumerate(operators[1:], start=1):
+            for actor in actors[op._name]:
+                thread = threading.Thread(
+                    target=self._process_operator,
+                    args=(idx, op, actor, op_buckets, actors, operators, final_results, result_lock),
+                    name=f"processor_{op._name}_{actor._ray_actor_id.hex()[:6]}",
+                    daemon=True
+                )
+                thread.start()
+                threads.append(thread)
+
+        # dynamically adjust batch_size to control memory usage
+        # estimated_row_count = self.data.count()
+        # batch_size = max(1, min(1000, estimated_row_count // (len(actors[operators[0]._name]) * 10)))
+        batch_size = len(actors[operators[0]._name])
+        print("\nBatchsize:", batch_size)
+        # feed the first operator in parallel via iter_batches
+        first_op = operators[0]
+        first_op_actors = actors[first_op._name]
+        actor_index = 0
+
+        try:
+            for batch in self.data.iter_batches(batch_size=batch_size, batch_format="pyarrow"):
+                futures = []
+                rows = []
+
+                for row_idx in range(len(batch)):
+                    row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
+                    actor = first_op_actors[actor_index % len(first_op_actors)]
+                    actor_index += 1
+                    futures.append(self._submit_to_actor(first_op, actor, row_data))
+                    rows.append(row_data)
+
+                results = ray.get(futures)
+
+                for result in results:
+                    if len(operators) > 1:
+                        op_buckets[operators[1]._name].put(result)
+                    else:
+                        with result_lock:
+                            if isinstance(result, list):
+                                final_results.extend(result)
+                            elif result is not None:
+                                final_results.append(result)
+        except Exception as e:
+            logger.error(f"Error processing data: {e}")
+            raise
+
+        # signal the next op's queue to terminate
+        if len(operators) > 1:
+            for _ in range(len(actors[operators[1]._name])):
+                op_buckets[operators[1]._name].put(None)
+
+        for thread in threads:
+            thread.join()
+
+        if final_results:
+            self.data = from_items(final_results)
+        return self
 
-        # store the results of each operator
+    def _process_single_operator(self, op, op_actors):
+        """Handle the single-operator case."""
         final_results = []
         # create an event for each operator
         events = {op._name: threading.Event() for op in operators}

From 490746f3e7a88bf24c124c5d3193c92d6f5ecbc8 Mon Sep 17 00:00:00 2001
From: xcy
Date: Tue, 5 Aug 2025 10:40:11 +0800
Subject: [PATCH 03/16] update ray_dataset.py

---
 data_juicer/core/data/ray_dataset.py | 365 +++++++++++++--------------
 1 file changed, 179 insertions(+), 186 deletions(-)

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index dfa4579fd2..65888d6e1b 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -7,7 +7,7 @@
 import time
 from typing import Any, Dict, List, Literal, Optional, Union
 import uuid
-
+import numpy
 from data_juicer.core.ray_actor import Actor
 import pyarrow
 from jsonargparse import Namespace
@@ -103,8 +103,8 @@ class RayDataset(DJDataset):
     def __init__(self, dataset: ray.data.Dataset, dataset_path: str = None, cfg: Optional[Namespace] = None) -> None:
         self.data = preprocess_dataset(dataset, dataset_path, cfg)
         self.num_proc = getattr(cfg, "np", getattr(cfg, "num_proc", None)) if cfg else None
-        self.gpu_pg = placement_group([{"CPU": 16, "GPU": 2}], strategy="STRICT_SPREAD")
-        ray.get(self.gpu_pg.ready())
+        # self.gpu_pg = placement_group([{"CPU": 16, "GPU": 2}], strategy="STRICT_SPREAD")
+        # ray.get(self.gpu_pg.ready())
 
     def schema(self) -> Schema:
         """Get dataset schema.
@@ -155,21 +155,23 @@ def get_column(self, column: str, k: Optional[int] = None) -> List[Any]:
 
         return [row[column] for row in self.data.take()]
 
-    def process2(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
+    def process1(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
         if operators is None:
             return self
         if not isinstance(operators, list):
             operators = [operators]
         for op in operators:
             self._run_single_op(op)
+        self.data = self.data.materialize()
         return self
 
     def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
-        print("test start")
         if operators is None:
             return self
         if not isinstance(operators, list):
             operators = [operators]
+
+        # add meta and stats columns (if needed)
         add_meta = False
         add_stats = False
         for op in operators:
@@ -179,72 +181,60 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -
             if isinstance(op, Filter):
                 if Fields.stats not in columns:
                     add_stats = True
+
         if add_meta:
             def process_batch_arrow(table: pyarrow.Table):
-                new_column_data = [{} for _ in range(len(table))]
-                new_table = table.append_column(Fields.meta, [new_column_data])
-                return new_table
+                new_column_data = [{} for _ in range(len(table))]
+                new_table = table.append_column(Fields.meta, [new_column_data])
+                return new_table
 
             self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
+
         if add_stats:
             def process_batch_arrow(table: pyarrow.Table):
-                new_column_data = [{} for _ in range(len(table))]
-                new_table = table.append_column(Fields.stats, [new_column_data])
-                return new_table
+                new_column_data = [{} for _ in range(len(table))]
+                new_table = table.append_column(Fields.stats, [new_column_data])
+                return new_table
             self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
 
+        # create actors for all operators (original logic unchanged)
         actors = {}
-        # resource allocation
-        gpu_allocate = [0,1,1]
-        actor_allocate = [0,1,1]
         cpu_allocate = 1
-        bundle_allocate = [0, 0, 0]
+
         for idx, op in enumerate(operators):
             op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
+            if op.use_cuda():
+                op_proc = 1
             actors[op._name] = []
-
-            if actor_allocate[idx] > 0:
-                actor_num = actor_allocate[idx]
-            else:
-                actor_num = min(op_proc, self.data.count())
+
+            actor_num = min(op_proc, self.data.count())
 
             if op.use_cuda():
-                num_gpus = gpu_allocate[idx]
+                num_gpus = 1
                 print(f"{op._name} allocate {num_gpus} GPUs.")
-
-                for _ in range(actor_num):  # launch multiple actors
+                for _ in range(actor_num):
                     actor = Actor.options(
                         name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
                         num_gpus=num_gpus,
                         num_cpus=cpu_allocate,
-                        scheduling_strategy=PlacementGroupSchedulingStrategy(
-                            placement_group=self.gpu_pg,
-                            placement_group_capture_child_tasks=True
-                        ),
-                        placement_group_bundle_index=bundle_allocate[idx],
                     ).remote(op)
                     actor.load_model.remote()
                     actors[op._name].append(actor)
             else:
                 num_gpus = 0
                 print(f"{op._name} allocate in CPU.")
-                for _ in range(actor_num):  # launch multiple actors
+                for _ in range(actor_num):
                     actor = Actor.options(
                         name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
                         num_gpus=num_gpus,
                         num_cpus=cpu_allocate,
-                        scheduling_strategy=PlacementGroupSchedulingStrategy(
-                            placement_group=self.gpu_pg,
-                            placement_group_capture_child_tasks=True
-                        ),
-                        placement_group_bundle_index=bundle_allocate[idx],
                    ).remote(op)
                     actors[op._name].append(actor)
 
         # print info for all actors
         for op_name, actor_list in actors.items():
             logger.info(f"Operator {op_name} has the following actors:")
             for i, actor in enumerate(actor_list):
                 logger.info(f"  Actor {i}: {actor._ray_actor_id.hex()[:6]}")
@@ -273,211 +263,214 @@ def process_batch_arrow(table: pyarrow.Table):
                 threads.append(thread)
 
         # dynamically adjust batch_size to control memory usage
-        # estimated_row_count = self.data.count()
-        # batch_size = max(1, min(1000, estimated_row_count // (len(actors[operators[0]._name]) * 10)))
-        batch_size = len(actors[operators[0]._name])
-        print("\nBatchsize:", batch_size)
+        estimated_row_count = self.data.count()
+
+        # set a batch size for each operator based on its actor count
+        batch_sizes = {
+            op._name: max(1, estimated_row_count // len(actors[op._name]))
+            for op in operators
+        }
+
         # feed the first operator in parallel via iter_batches
         first_op = operators[0]
         first_op_actors = actors[first_op._name]
         actor_index = 0
 
         try:
-            for batch in self.data.iter_batches(batch_size=batch_size, batch_format="pyarrow"):
-                futures = []
-                rows = []
-
+            for batch in self.data.iter_batches(
+                batch_size=batch_sizes[first_op._name],
+                batch_format="pyarrow"
+            ):
+                # convert the batch into rows for processing
                 for row_idx in range(len(batch)):
-                    row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
+                    # extract a single row
+                    row_data = {}
+                    for col_name in batch.column_names:
+                        col_data = batch.column(col_name)
+                        row_data[col_name] = col_data[row_idx].as_py()
+
+                    # pick an actor for load balancing
                     actor = first_op_actors[actor_index % len(first_op_actors)]
                     actor_index += 1
-                    futures.append(self._submit_to_actor(first_op, actor, row_data))
-                    rows.append(row_data)
-
-                results = ray.get(futures)
-
-                for result in results:
+
+                    # process the data asynchronously
+                    future = self._submit_to_actor(first_op, actor, row_data)
+                    result = ray.get(future)
+
+                    # put the result into the second operator's queue
                     if len(operators) > 1:
                         op_buckets[operators[1]._name].put(result)
                     else:
                         with result_lock:
                             if isinstance(result, list):
                                 final_results.extend(result)
-                            elif result is not None:
+                            else:
                                 final_results.append(result)
+
         except Exception as e:
             logger.error(f"Error processing data: {e}")
             raise
 
-        # signal the next op's queue to terminate
+        # add end markers to the second operator's queue
         if len(operators) > 1:
             for _ in range(len(actors[operators[1]._name])):
                 op_buckets[operators[1]._name].put(None)
 
+        # wait for all threads to finish
         for thread in threads:
             thread.join()
 
+        # return the final results
         if final_results:
             self.data = from_items(final_results)
         return self
-
+
     def _process_single_operator(self, op, op_actors):
         """Handle the single-operator case."""
         final_results = []
-        # create an event for each operator
-        events = {op._name: threading.Event() for op in operators}
-        # mark the first operator's event as set
-        events[operators[0]._name].set()
-
-        # track the completion status of each operator
-        op_completion_count = {op._name: 0 for op in operators}
-        # synchronize the multiple actors of the same operator
-        op_actor_locks = {op._name: threading.Lock() for op in operators}
-
-        def process_operator(op_idx, op, actor):
-            op_name = op._name
-            input_queue = op_buckets[op_name]
-
-            # determine the output queue
-            if op_idx + 1 < len(operators):
-                output_queue = op_buckets[operators[op_idx + 1]._name]
-            else:
-                output_queue = None  # the last operator
-
-            logger.info(f"Starting processor for {op_name} actor {actor._ray_actor_id.hex()[:6]}")
-
-            start_time = time.time()
+        actor_index = 0
+
+        # dynamically adjust batch_size
+        estimated_row_count = self.data.count()
+        batch_size = max(1, min(1000, estimated_row_count // (len(op_actors) * 10)))
+
+        for batch in self.data.iter_batches(
+            batch_size=batch_size,
+            batch_format="pyarrow"
+        ):
+            # convert the batch into rows for processing
+            for row_idx in range(len(batch)):
+                # extract a single row
+                row_data = {}
+                for col_name in batch.column_names:
+                    col_data = batch.column(col_name)
+                    row_data[col_name] = col_data[row_idx].as_py()
+
+                # pick an actor for load balancing
+                actor = op_actors[actor_index % len(op_actors)]
+                actor_index += 1
+
+                # process the data
+                future = self._submit_to_actor(op, actor, row_data)
+                result = ray.get(future)
+
+                if isinstance(result, list):
+                    final_results.extend(result)
+                elif result is not None:
+                    final_results.append(result)
+
+        if final_results:
+            self.data = from_items(final_results)
+        return self
 
-            while True:
-                try:
-                    # wait for the previous operator to fire its event before processing
-                    events[op_name].wait()
-
-                    # fetch data from the input queue with a timeout to avoid waiting forever
-                    data_item = input_queue.get(timeout=10.0)
-
-                    # check for the end marker
-                    if data_item is None:
-                        with op_actor_locks[op_name]:
-                            op_completion_count[op_name] += 1
-                            # only propagate to the next queue after all actors have received the end marker
-                            if op_completion_count[op_name] == len(actors[op_name]) and output_queue:
-                                # pass as many end markers to the next operator as it has actors
-                                next_op_name = operators[op_idx + 1]._name if op_idx + 1 < len(operators) else None
-                                if next_op_name:
-                                    for _ in range(len(actors[next_op_name])):
-                                        output_queue.put(None)
-                        break
-
-                    # process the data
-                    future = None
-                    if isinstance(op, Mapper):
-                        if op.use_cuda():
-                            if op.is_batched_op():
-                                future = actor.mapper_cuda_batched.remote(self.transform_to_2d_format(data_item))
-                            else:
-                                future = actor.mapper_cuda.remote(data_item)
-                        else:
-                            future = actor.mapper_cpu.remote(data_item)
-
-                        result = ray.get(future)
-                        # print("res:", result)
-                        # send the result to the next queue
-                        if output_queue:
-                            output_queue.put(result)
-                        else:
-                            final_results.append(result)
-
-                    elif isinstance(op, Filter):
-                        if op.use_cuda():
-                            if op.is_batched_op():
-                                future = actor.filter_cuda_batched.remote(data_item)
-                            else:
-                                future = actor.filter_cuda_single.remote(data_item)
-                        else:
-                            if op.is_batched_op():
-                                future = actor.filter_cpu_batched.remote(data_item)
-                            else:
-                                future = actor.filter_cpu_single.remote(data_item)
-
-                        results = ray.get(future)
-
-                        if results:
-                            if isinstance(results, list):
-                                for result in results:
-                                    if output_queue:
-                                        output_queue.put(result)
-                                    else:
-                                        final_results.append(result)
-                            else:
-                                if output_queue:
-                                    output_queue.put(results)
-                                else:
-                                    final_results.append(results)
-
-                        if op.stats_export_path is not None:
-                            actor.export_stats(results, op.stats_export_path)
-
-                    # mark the task as done
-                    input_queue.task_done()
-
-                    # after processing, set the current operator's event so the next operator can start
-                    if op_idx + 1 < len(operators):
-                        events[operators[op_idx + 1]._name].set()
-
-                except queue.Empty:
-                    logger.info(f"{op_name} actor {actor._ray_actor_id.hex()[:6]} queue timeout, checking if pipeline is complete")
-                    continue
-                except Exception as e:
-                    logger.error(f"Error in {op_name} actor {actor._ray_actor_id.hex()[:6]}: {e}")
-                    input_queue.task_done()
-                    break
-
-            end_time = time.time()
-            logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f} seconds")
-
-        # start a processing thread for each actor of each operator
-        threads = []
-        for idx, op in enumerate(operators):
-            for actor in actors[op._name]:
-                thread = threading.Thread(
-                    target=process_operator,
-                    args=(idx, op, actor),
-                    name=f"processor_{op._name}_{actor._ray_actor_id.hex()[:6]}"
-                )
-                thread.daemon = True
-                thread.start()
-                threads.append(thread)
-
-        # wait for all threads to finish
-        for thread in threads:
-            thread.join()
-        print("\nfinal res:", final_results)
-        # merge the final results
-        # flattened_data = list(chain.from_iterable(final_results))
-        # print("\nfinal res:", flattened_data)
-        # self.data = from_items(flattened_data)
-        self.data = from_items(final_results)
-        return self.data
-
+    def _submit_to_actor(self, op, actor, data_item):
+        """Submit a data item to an actor for processing."""
+        if isinstance(op, Mapper):
+            if op.use_cuda():
+                if op.is_batched_op():
+                    data_item = self.transform_to_2d_format(data_item)
+                    return actor.mapper_cuda_batched.remote(data_item)
+                else:
+                    return actor.mapper_cuda.remote(data_item)
+            else:
+                return actor.mapper_cpu.remote(data_item)
+
+        elif isinstance(op, Filter):
+            if op.use_cuda():
+                if op.is_batched_op():
+                    return actor.filter_cuda_batched.remote(data_item)
+                else:
+                    return actor.filter_cuda_single.remote(data_item)
+            else:
+                if op.is_batched_op():
+                    return actor.filter_cpu_batched.remote(data_item)
+                else:
+                    return actor.filter_cpu_single.remote(data_item)
+
+    def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, final_results, result_lock):
+        op_name = op._name
+        input_queue = op_buckets[op_name]
+
+        # determine the output queue
+        if op_idx + 1 < len(operators):
+            output_queue = op_buckets[operators[op_idx + 1]._name]
+        else:
+            output_queue = None
+
+        logger.info(f"Starting processor for {op_name} actor {actor._ray_actor_id.hex()[:6]}")
+
+        start_time = time.time()
+        processed_count = 0
+
+        while True:
+            try:
+                # fetch data from the input queue
+                data_item = input_queue.get(timeout=30.0)  # longer timeout
+
+                # check for the end marker
+                if data_item is None:
+                    if output_queue:
+                        # pass end markers to the next operator
+                        for _ in range(len(actors[operators[op_idx + 1]._name])):
+                            output_queue.put(None)
+                    break
+
+                # process the data
+                future = self._submit_to_actor(op, actor, data_item)
+                results = ray.get(future)
+                processed_count += 1
+
+                if isinstance(op, Mapper):
+                    if output_queue:
+                        output_queue.put(results)
+                    else:
+                        with result_lock:
+                            if isinstance(results, list):
+                                final_results.extend(results)
+                            else:
+                                final_results.append(results)
+
+                elif isinstance(op, Filter):
+                    if results:
+                        if output_queue:
+                            if isinstance(results, list):
+                                for result in results:
+                                    output_queue.put(result)
+                            else:
+                                output_queue.put(results)
+                        else:
+                            with result_lock:
+                                if isinstance(results, list):
+                                    final_results.extend(results)
+                                else:
+                                    final_results.append(results)
+
+                # mark the task as done
+                input_queue.task_done()
+
+            except queue.Empty:
+                logger.warning(f"{op_name} actor {actor._ray_actor_id.hex()[:6]} queue timeout, processed {processed_count} items")
+                continue
+            except Exception as e:
+                logger.error(f"Error in {op_name} actor {actor._ray_actor_id.hex()[:6]}: {e}")
+                input_queue.task_done()
+                break
+
+        end_time = time.time()
+        logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f} seconds, processed {processed_count} items")
 
     def transform_to_2d_format(self, data):
         """
         Convert flat row-wise data into the nested format,
         grouping all fields by the unique values of __dj__source_file__.
         """
-        print("data before trans", data)
+        # print("data before trans", data)
         if '__dj__source_file__' not in data:
             raise ValueError("Data must contain the '__dj__source_file__' field")
 
         source_files = data['__dj__source_file__']
 
         # get the unique source files, preserving order
-        unique_sources = []
-        seen = set()
-        for source in source_files:
-            if source not in seen:
-                unique_sources.append(source)
-                seen.add(source)
+        unique_sources = list(dict.fromkeys(source_files))
 
         # build an index mapping for each unique source file
         source_to_indices = {}
@@ -529,7 +522,7 @@ def transform_to_2d_format(self, data):
             indices = source_to_indices[source]
             # repeat the value for each group
             transformed_data[field_name].append(field_value)
-        print("data after trans", transformed_data)
+        # print("data after trans", transformed_data)
         return transformed_data
 

From d3b7d514b0d9a93c7a39959d9364d7a8b0438833 Mon Sep 17 00:00:00 2001
From: xcy
Date: Tue, 5 Aug 2025 11:03:24 +0800
Subject: [PATCH 04/16] Implement op-parallel data processing based on Ray
 Actor

---
 data_juicer/core/data/ray_dataset.py               |  17 +-
 data_juicer/core/ray_actor.py                      |   8 +-
 data_juicer/ops/base_op.py                         |   1 +
 .../ops/filter/video_aesthetics_filter.py          |  68 ++++++-
 .../video_captioning_from_frames_mapper.py         | 175 +++++++++++++++++-
 .../process_video_on_ray/configs/pr_demo.yaml      |   2 +-
 6 files changed, 251 insertions(+), 20 deletions(-)

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index 65888d6e1b..508cfe3407 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -200,35 +200,34 @@ def process_batch_arrow(table: pyarrow.Table):
 
         # create actors for all operators (original logic unchanged)
         actors = {}
-        cpu_allocate = 1
 
         for idx, op in enumerate(operators):
-            op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
             if op.use_cuda():
                 op_proc = 1
+            else:
+                op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
+
             actors[op._name] = []
 
             actor_num = min(op_proc, self.data.count())
 
             if op.use_cuda():
-                num_gpus = 1
-                print(f"{op._name} allocate {num_gpus} GPUs.")
+                print(f"{op._name} allocate {op.gpu_required} GPUs.")
                 for _ in range(actor_num):
                     actor = Actor.options(
                         name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
-                        num_gpus=num_gpus,
-                        num_cpus=cpu_allocate,
+                        num_gpus=op.gpu_required,
+                        num_cpus=op.cpu_required,
                     ).remote(op)
                     actor.load_model.remote()
                     actors[op._name].append(actor)
             else:
-                num_gpus = 0
                 print(f"{op._name} allocate in CPU.")
                 for _ in range(actor_num):
                     actor = Actor.options(
                         name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
-                        num_gpus=num_gpus,
-                        num_cpus=cpu_allocate,
+                        num_gpus=0,
+                        num_cpus=op.cpu_required,
                     ).remote(op)
                     actors[op._name].append(actor)
 
diff --git a/data_juicer/core/ray_actor.py b/data_juicer/core/ray_actor.py
index bacc57b30d..bdef9d86da 100644
--- a/data_juicer/core/ray_actor.py
+++ b/data_juicer/core/ray_actor.py
@@ -37,7 +37,7 @@ def mapper_cuda(self, data):
     def mapper_cuda_batched(self, data):
         if not self._model_loaded:
             self.load_model()  # make sure the model is loaded before use
-        data = self.op.process_batched(data, self.model, self.processor)
+        data = self.op.process_batched_actor(data, self.model, self.processor)
         return data
 
@@ -48,7 +48,7 @@ def mapper_cpu(self, data):
     def filter_cuda_single(self, data):
         if not self._model_loaded:
             self.load_model()
-        data = self.op.compute_stats_single(data, self.model, self.processor)
+        data = self.op.compute_stats_single_actor(data, self.model, self.processor)
         keep = self.op.process_single(data)
         if keep:
             return data
@@ -117,7 +117,3 @@ def filter_cpu_batched(self, data):
 
         return filtered_data
 
-    def export_stats(self, data, export_path):
-
-        return data.write_json(export_path, force_ascii=False)
-
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
index c46e677870..2d7db36571 100644
--- a/data_juicer/ops/base_op.py
+++ b/data_juicer/ops/base_op.py
@@ -196,6 +196,7 @@ def __init__(self, *args, **kwargs):
         self.num_proc = kwargs.get("num_proc", None)
         self.cpu_required = kwargs.get("cpu_required", 1)
         self.mem_required = kwargs.get("mem_required", 0)
+        self.gpu_required = kwargs.get("gpu_required", 1)
 
         if isinstance(self.mem_required, str):
             self.mem_required = size_to_bytes(self.mem_required) / 1024**3
diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py
index c642c0a378..d1ee055580 100644
--- a/data_juicer/ops/filter/video_aesthetics_filter.py
+++ b/data_juicer/ops/filter/video_aesthetics_filter.py
@@ -116,7 +116,7 @@ def __init__(
             "" if frame_sampling_method == "all_keyframes" else f"-{frame_num}"
         )
 
-    def compute_stats_single(self, sample, model, processor, rank=None, context=False):
+    def compute_stats_single_actor(self, sample, model, processor, rank=None, context=False):
         # check if it's computed already
         if StatsKeys.video_frames_aesthetics_score in 
sample[Fields.stats]: return sample @@ -177,6 +177,72 @@ def compute_stats_single(self, sample, model, processor, rank=None, context=Fals sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = aesthetics_scores + if not context: + for vid_key in videos: + close_video(videos[vid_key]) + + return sample + def compute_stats_single(self, sample, rank=None, context=False): + # check if it's computed already + if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: + return sample + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = np.array([], dtype=np.float64) + return sample + + # load videos + loaded_video_keys = sample[self.video_key] + sample, videos = load_data_with_context(sample, context, loaded_video_keys, load_video) + + aesthetics_scores = [] + for key, video in videos.items(): + sampled_frames_key = key + self.sampled_frames_key_suffix + if video is None: + continue + elif context and sampled_frames_key in sample[Fields.context]: + # sampled frames can be found in the context + frames = sample[Fields.context][sampled_frames_key] + else: + # extract frame images + if self.frame_sampling_method == "all_keyframes": + frames = extract_key_frames(video) + elif self.frame_sampling_method == "uniform": + frames = extract_video_frames_uniformly(video, self.frame_num) + else: + frames = [] + + # store the sampled frames in the context + if context: + sample[Fields.context][sampled_frames_key] = frames + frame_images = [frame.to_image() for frame in frames] + + if len(frame_images) > 0: + # compute aesthetics_scores + model, processor = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) + inputs = processor(images=frame_images, return_tensors="pt").to(model.device) + with torch.no_grad(): + outputs = model(**inputs) + if self.need_normalized_by_ten: + aesthetics_score = outputs.logits / 10.0 + else: + aesthetics_score = outputs.logits + + if self.reduce_mode == "avg": + aesthetics_score = float(aesthetics_score.mean()) + elif self.reduce_mode == "max": + aesthetics_score = float(aesthetics_score.max()) + else: + aesthetics_score = float(aesthetics_score.min()) + else: + aesthetics_score = 0.0 + + aesthetics_scores.append(aesthetics_score) + + logger.debug(f"aesthetics_score: {aesthetics_scores}") + + sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = aesthetics_scores + if not context: for vid_key in videos: close_video(videos[vid_key]) diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index 8abdc63700..c96cdf5755 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -160,7 +160,7 @@ def __init__( trust_remote_code=trust_remote_code ) - def _process_single_sample(self, ori_sample, model, processor, rank=None, context=False): + def _process_single_sample_actor(self, ori_sample, model, processor, rank=None, context=False): # there is no videos in this sample if self.video_key not in ori_sample or not ori_sample[self.video_key]: @@ -331,7 +331,7 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk - def process_batched(self, samples, model, processor, rank=None, context=False): + def process_batched_actor(self, samples, model, processor, rank=None, 
context=False): """ :param samples: :return: @@ -355,7 +355,7 @@ def process_batched(self, samples, model, processor, rank=None, context=False): for ori_sample in reconstructed_samples: if self.keep_original_sample: samples_after_generation.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample, + generated_samples = self._process_single_sample_actor(ori_sample, model, processor, rank=rank, @@ -369,3 +369,172 @@ def process_batched(self, samples, model, processor, rank=None, context=False): res_samples[key] = [s[key] for s in samples_after_generation] return res_samples + + + def _process_single_sample(self, ori_sample, rank=None, context=False): + + # there is no videos in this sample + if self.video_key not in ori_sample or not ori_sample[self.video_key]: + return [] + + # the generated results + generated_samples = [ + copy.deepcopy(ori_sample) + for _ in range(self.num_newly_generated_samples) + ] + for generated_sample in generated_samples: + generated_sample[self.text_key] = '' + + # load videos + loaded_video_keys = ori_sample[self.video_key] + sample, videos = load_data_with_context(ori_sample, context, + loaded_video_keys, load_video) + + text = sample[self.text_key] + offset = 0 + model, processor = get_model(self.model_key, rank, self.use_cuda()) + + for chunk in text.split(SpecialTokens.eoc): + + video_count = chunk.count(SpecialTokens.video) + + # no video or no text + if video_count == 0 or len(chunk.strip()) == 0: + continue + else: + text_with_only_special_tokens = remove_non_special_tokens( + chunk) + # generate candidate caption(s) in batch manner + generated_text_candidates_single_chunk = [ + [] for _ in range(self.caption_num) + ] + for video_key in loaded_video_keys[offset:offset + + video_count]: + video = videos[video_key] + video_frame_videos_chunk = [] + # extract frame videos + if self.frame_sampling_method == 'all_keyframes': + frames = extract_key_frames(video) + elif self.frame_sampling_method == 'uniform': + frames = extract_video_frames_uniformly( + video, self.frame_num) + else: + frames = [] + frame_videos = [frame.to_image() for frame in frames] + for frame in frame_videos: + if self.horizontal_flip: + frame = ImageOps.mirror(frame) + if self.vertical_flip: + frame = ImageOps.flip(frame) + video_frame_videos_chunk.append(frame) + + # construct prompts + if self.prompt_key and isinstance( + ori_sample[self.prompt_key], str): + # check prompt_key is not None, and it's a str + # in the sample + prompt_texts = [ori_sample[self.prompt_key] + ] * len(video_frame_videos_chunk) + elif self.prompt and isinstance(self.prompt, str): + # check prompt is not None, and it's a str + prompt_texts = [self.prompt + ] * len(video_frame_videos_chunk) + else: + prompt_texts = None + + inputs = processor( + text=prompt_texts, + images=video_frame_videos_chunk, + return_tensors='pt', + ).to(model.device) + with torch.no_grad(): + for i in range(self.caption_num): + generated_ids = model.generate(**inputs, + max_new_tokens=128, + do_sample=True) + generated_text = processor.batch_decode( + generated_ids, skip_special_tokens=True) + generated_text_candidates_single_chunk[i] += [ + '. '.join([txt.strip() for txt in generated_text]) + ] + + # 3. 
insert a list of generated captions into the positions of + # subsequent placeholders in the original string + new_generated_text_all_videos = [ + [] for _ in range(self.num_newly_generated_samples) + ] + # new_generated_text_all_videos is a helper array, + # element [i][j] + # denotes the reduced $i$-th result for the $j$-th video + + # reduce the captions according to given mode video by video + for j in range(video_count): + new_generated_text_per_video = self._reduce_captions( + chunk, + [ + captions[j] for captions in + generated_text_candidates_single_chunk + ], + ) + assert self.num_newly_generated_samples == len( + new_generated_text_per_video) + for i in range(len(new_generated_text_per_video)): + new_generated_text_all_videos[i].append( + new_generated_text_per_video[i]) + + # insert the captions according to given mode + place_holders = [SpecialTokens.video] * video_count + for i in range(self.num_newly_generated_samples): + generated_text_per_chunk = insert_texts_after_placeholders( + original_string=text_with_only_special_tokens, + placeholders=place_holders, + new_texts=new_generated_text_all_videos[i], + ) + generated_samples[i][ + self. + text_key] += f'{generated_text_per_chunk}' \ + f'{SpecialTokens.eoc}' + + offset += video_count + + if not context: + for vid_key in videos: + close_video(videos[vid_key]) + return generated_samples + + def process_batched(self, samples, rank=None, context=False): + """ + :param samples: + :return: + + Note: + This is a batched_OP, whose the input and output type are + both list. Suppose there are $N$ input sample list with batch + size as $b$, and denote caption_num as $M$. + the number of total samples after generation is $2Nb$ + for 'random_any' and 'similar_one' mode, + and $(1+M)Nb$ for 'all' mode. 
+        """
+        # reconstruct samples from "dict of lists" to "list of dicts"
+        reconstructed_samples = []
+        for i in range(len(samples[self.text_key])):
+            reconstructed_samples.append(
+                {key: samples[key][i]
+                 for key in samples})
+        samples_after_generation = []
+        # do generation for each sample within the batch
+        for ori_sample in reconstructed_samples:
+            if self.keep_original_sample:
+                samples_after_generation.append(ori_sample)
+            generated_samples = self._process_single_sample(ori_sample,
+                                                            rank=rank,
+                                                            context=context)
+            if len(generated_samples) != 0:
+                samples_after_generation.extend(generated_samples)
+        # reconstruct samples from "list of dicts" to "dict of lists"
+        keys = samples_after_generation[0].keys()
+        res_samples = {}
+        for key in keys:
+            res_samples[key] = [s[key] for s in samples_after_generation]
+
+        return res_samples
diff --git a/demos/process_video_on_ray/configs/pr_demo.yaml b/demos/process_video_on_ray/configs/pr_demo.yaml
index 7000fc4847..ffde4c3e93 100644
--- a/demos/process_video_on_ray/configs/pr_demo.yaml
+++ b/demos/process_video_on_ray/configs/pr_demo.yaml
@@ -9,7 +9,7 @@ dataset:
   configs:
     - type: local
       path: './demos/process_video_on_ray/data/demo-dataset.jsonl'  # path to your dataset directory or file
-export_path: './outputs/demo/process_video_on_ray/my-dataset'
+export_path: './outputs/demo/process_video_on_ray/demo-dataset'
 
 
 # process schedule

From a81e90c74406377ec953a9ad210a2442d64c463a Mon Sep 17 00:00:00 2001
From: xcy
Date: Tue, 5 Aug 2025 17:35:21 +0800
Subject: [PATCH 05/16] update 0805

---
 data_juicer/core/data/ray_dataset.py          | 218 +++++++-----------
 data_juicer/core/executor/ray_executor.py     |   7 +-
 .../ops/filter/video_watermark_filter.py      |  63 +++++
 .../ops/mapper/video_split_by_scene_mapper.py | 122 +++++-----
 .../process_video_on_ray/configs/pr_demo.yaml |  26 ++-
 5 files changed, 237 insertions(+), 199 deletions(-)

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index 508cfe3407..5a86e6902e 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -155,7 +155,7 @@ def get_column(self, column: str, k: Optional[int] = None) -> List[Any]:
 
         return [row[column] for row in self.data.take()]
 
-    def process1(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
+    def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
         if operators is None:
             return self
         if not isinstance(operators, list):
@@ -165,7 +165,7 @@ def process1(self, operators, *, exporter=None, checkpointer=None, tracer=None)
         self.data = self.data.materialize()
         return self
 
-    def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
+    def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset:
         if operators is None:
             return self
         if not isinstance(operators, list):
@@ -198,57 +198,45 @@ def process_batch_arrow(table: pyarrow.Table):
 
             self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
 
-        # create actors for all operators (original logic unchanged)
+        # Step 1: create actors for all operators
         actors = {}
-        cpu_allocate = 1
-
-        for idx, op in enumerate(operators):
-            if op.use_cuda():
-                op_proc = 1
-            else:
-                op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
-
+        for op in operators:
+            op_proc = 1 if op.use_cuda() else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
+            actor_num = min(op_proc, self.data.count())
             actors[op._name] = []
-
-            actor_num = min(op_proc, self.data.count())
+            for _ in range(actor_num):
+                actor = Actor.options(
+                    name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
+                    num_gpus=op.gpu_required if op.use_cuda() else 0,
+                    num_cpus=op.cpu_required
+                ).remote(op)
 
-            if op.use_cuda():
-                print(f"{op._name} allocate {op.gpu_required} GPUs.")
-                for _ in range(actor_num):
-                    actor = Actor.options(
-                        name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
-                        num_gpus=op.gpu_required,
-                        num_cpus=op.cpu_required,
-                    ).remote(op)
+                if op.use_cuda():
                     actor.load_model.remote()
-                    actors[op._name].append(actor)
-            else:
-                print(f"{op._name} allocate in CPU.")
-                for _ in range(actor_num):
-                    actor = Actor.options(
-                        name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}",
-                        num_gpus=0,
-                        num_cpus=op.cpu_required,
-                    ).remote(op)
-                    actors[op._name].append(actor)
-
-        # print info for all actors
-        for op_name, actor_list in actors.items():
-            logger.info(f"Operator {op_name} has the following actors:")
-            for i, actor in enumerate(actor_list):
-                logger.info(f"  Actor {i}: {actor._ray_actor_id.hex()[:6]}")
-
-        # if there is only one operator, process it directly
-        if len(operators) == 1:
-            return self._process_single_operator(operators[0], actors[operators[0]._name])
+
+                actors[op._name].append(actor)
+
+            logger.info(f"Operator {op._name} has {len(actors[op._name])} actor(s).")
+
+        # Step 2: set each operator's batch size to its actor count
+        batch_sizes = {
+            op._name: max(1, len(actors[op._name]))
+            for op in operators
+        }
+
+        logger.info(f"Batch sizes per operator: {batch_sizes}")
+
+        # Step 3: if there is only one operator, handle it separately
+        if len(operators) == 1:
+            return self._process_single_operator(operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name])
 
-        # create operator queues (starting from the second operator)
-        op_buckets = {op._name: queue.Queue(maxsize=100) for op in operators[1:]}  # maxsize bounds memory usage
-
-        # store the final results
+        # Step 4: build op_buckets and the result buffer
+        op_buckets = {op._name: queue.Queue(maxsize=100) for op in operators[1:]}
         final_results = []
         result_lock = threading.Lock()
 
-        # start processing threads for the downstream operators (consumers first)
+        # Step 5: start consumer threads (from the second operator on)
         threads = []
         for idx, op in enumerate(operators[1:], start=1):
             for actor in actors[op._name]:
                 thread = threading.Thread(
                     target=self._process_operator,
                     args=(idx, op, actor, op_buckets, actors, operators, final_results, result_lock),
                     name=f"processor_{op._name}_{actor._ray_actor_id.hex()[:6]}",
                     daemon=True
                 )
                 thread.start()
                 threads.append(thread)
 
-        # dynamically adjust batch_size to control memory usage
-        estimated_row_count = self.data.count()
-
-        # set a batch size for each operator based on its actor count
-        batch_sizes = {
-            op._name: max(1, estimated_row_count // len(actors[op._name]))
-            for op in operators
-        }
-
-        # feed the first operator in parallel via iter_batches
+        # Step 6: submit data through the first operator
         first_op = operators[0]
         first_op_actors = actors[first_op._name]
         actor_index = 0
 
         try:
             for batch in self.data.iter_batches(
                 batch_size=batch_sizes[first_op._name],
                 batch_format="pyarrow"
             ):
-                # convert the batch into rows for processing
+                futures = []
                 for row_idx in range(len(batch)):
-                    # extract a single row
-                    row_data = {}
-                    for col_name in batch.column_names:
-                        col_data = batch.column(col_name)
-                        row_data[col_name] = col_data[row_idx].as_py()
-
-                    # pick an actor for load balancing
+                    row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
                     actor = first_op_actors[actor_index % len(first_op_actors)]
                     actor_index += 1
-
-                    # process the data asynchronously
-                    future = self._submit_to_actor(first_op, actor, row_data)
-                    result = ray.get(future)
-
-                    # put the result into the second operator's queue
+                    futures.append(self._submit_to_actor(first_op, actor, row_data))
+
+                results = ray.get(futures)
+
+                for result in results:
                     if len(operators) > 1:
                         op_buckets[operators[1]._name].put(result)
                     else:
                         with result_lock:
                             if isinstance(result, list):
                                 final_results.extend(result)
-                            else:
+                            elif result is not None:
                                 final_results.append(result)
         except Exception as e:
             logger.error(f"Error processing data: {e}")
             raise
 
-        # add end markers to the second operator's queue
-        if len(operators) > 1:
-            for _ in range(len(actors[operators[1]._name])):
-                op_buckets[operators[1]._name].put(None)
+        # Step 7: signal downstream completion and wait for threads
+        for _ in range(len(actors[operators[1]._name])):
+            op_buckets[operators[1]._name].put(None)
 
-        # wait for all threads to finish
         for thread in threads:
             thread.join()
 
-        # return the final results
         if final_results:
             self.data = from_items(final_results)
+
         return self
 
-    def _process_single_operator(self, op, op_actors):
+    def _process_single_operator(self, op, op_actors, batch_size):
         """Handle the single-operator case."""
         final_results = []
         actor_index = 0
 
-        # dynamically adjust batch_size
-        estimated_row_count = self.data.count()
-        batch_size = max(1, min(1000, estimated_row_count // (len(op_actors) * 10)))
-
-        for batch in self.data.iter_batches(
-            batch_size=batch_size,
-            batch_format="pyarrow"
-        ):
-            # convert the batch into rows for processing
+        logger.info(f"Single operator {op._name} running with batch_size = {batch_size}")
+
+        for batch in self.data.iter_batches(batch_size=batch_size, batch_format="pyarrow"):
+            futures = []
             for row_idx in range(len(batch)):
-                # extract a single row
-                row_data = {}
-                for col_name in batch.column_names:
-                    col_data = batch.column(col_name)
-                    row_data[col_name] = col_data[row_idx].as_py()
-
-                # pick an actor for load balancing
+                row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
                 actor = op_actors[actor_index % len(op_actors)]
                 actor_index += 1
-
-                # process the data
-                future = self._submit_to_actor(op, actor, row_data)
-                result = ray.get(future)
-
+                futures.append(self._submit_to_actor(op, actor, row_data))
+            results = ray.get(futures)
+
+            for result in results:
                 if isinstance(result, list):
                     final_results.extend(result)
                 elif result is not None:
                     final_results.append(result)
 
         if final_results:
             self.data = from_items(final_results)
+
         return self
 
     def _submit_to_actor(self, op, actor, data_item):
         if isinstance(op, Mapper):
             if op.use_cuda():
-                if op.is_batched_op():
-                    data_item = self.transform_to_2d_format(data_item)
-                    return actor.mapper_cuda_batched.remote(data_item)
-                else:
-                    return actor.mapper_cuda.remote(data_item)
+                return actor.mapper_cuda_batched.remote(self.transform_to_2d_format(data_item)) if op.is_batched_op() else actor.mapper_cuda.remote(data_item)
+                # return actor.mapper_cuda_batched.remote(data_item) if op.is_batched_op() else actor.mapper_cuda.remote(data_item)
             else:
                 return actor.mapper_cpu.remote(data_item)
 
         elif isinstance(op, Filter):
             if op.use_cuda():
-                if op.is_batched_op():
-                    return actor.filter_cuda_batched.remote(data_item)
-                else:
-                    return actor.filter_cuda_single.remote(data_item)
+                return actor.filter_cuda_batched.remote(data_item) if op.is_batched_op() else actor.filter_cuda_single.remote(data_item)
             else:
-                if op.is_batched_op():
-                    return actor.filter_cpu_batched.remote(data_item)
-                else:
-                    return actor.filter_cpu_single.remote(data_item)
+                return actor.filter_cpu_batched.remote(data_item) if op.is_batched_op() else actor.filter_cpu_single.remote(data_item)
 
     def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, final_results, result_lock):
         op_name = op._name
         input_queue = op_buckets[op_name]
         output_queue = op_buckets.get(operators[op_idx + 
1]._name) if op_idx + 1 < len(operators) else None
 
-        start_time = time.time()
+        logger.info(f"Starting processor for {op_name} actor {actor._ray_actor_id.hex()[:6]}")
         processed_count = 0
+        start_time = time.time()
 
         while True:
             try:
-                # fetch data from the input queue
-                data_item = input_queue.get(timeout=30.0)  # increase the timeout
-
-                # check for the end-of-stream marker
+                data_item = input_queue.get(timeout=30.0)
                 if data_item is None:
                     if output_queue:
-                        # pass the end-of-stream marker to the next operator
                         for _ in range(len(actors[operators[op_idx + 1]._name])):
                             output_queue.put(None)
                     break
-
-                # process the data
+
                 future = self._submit_to_actor(op, actor, data_item)
                 results = ray.get(future)
                 processed_count += 1
@@ -428,7 +366,7 @@ def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, fi
                         final_results.extend(results)
                     else:
                         final_results.append(results)
-
+
                 elif isinstance(op, Filter):
                     if results:
                         if output_queue:
@@ -444,7 +382,6 @@ def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, fi
                     else:
                         final_results.append(results)
 
-                # mark the task as done
                 input_queue.task_done()
 
             except queue.Empty:
@@ -456,7 +393,8 @@ def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, fi
                 break
 
         end_time = time.time()
-        logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f} seconds, processed {processed_count} items")
+        logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f}s, processed {processed_count} items")
+
     def transform_to_2d_format(self, data):
         """
         Convert data from the flat format into the nested format.
@@ -464,7 +402,11 @@ def transform_to_2d_format(self, data):
         """
         # print("data before trans", data)
         if '__dj__source_file__' not in data:
-            raise ValueError("Data must contain the '__dj__source_file__' field")
+            if 'videos' not in data:
+                raise ValueError("The '__dj__source_file__' field is missing and cannot be inferred from the 'videos' field")
+            # print(data)
+            data['__dj__source_file__'] = data['videos']
+
         source_files = data['__dj__source_file__']
 
diff --git a/data_juicer/core/executor/ray_executor.py b/data_juicer/core/executor/ray_executor.py
index 9b0b0dbcaf..dd95a3bbe1 100644
--- a/data_juicer/core/executor/ray_executor.py
+++ b/data_juicer/core/executor/ray_executor.py
@@ -73,6 +73,8 @@ def __init__(self, cfg: Optional[Namespace] = None):
             keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds,
             **self.cfg.export_extra_args,
         )
+        self.op_enable_parallel = True
+        # self.op_enable_parallel = False
 
     def run(self, load_data_np: Optional[PositiveInt] = None, skip_return=False):
         """
@@ -99,7 +101,10 @@ def run(self, load_data_np: Optional[PositiveInt] = None, skip_return=False):
         # 3. data process
         logger.info("Processing data...")
         tstart = time.time()
-        dataset.process(ops)
+        if self.op_enable_parallel:
+            dataset.process_parallel(ops)
+        else:
+            dataset.process(ops)
 
         # 4. 
data export logger.info("Exporting dataset to disk...") diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 22808d492b..afc3c0c622 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -104,6 +104,69 @@ def __init__( "" if frame_sampling_method == "all_keyframes" else f"-{frame_num}" ) + def compute_stats_single_actor(self, sample, model, processor, rank=None, context=False): + # check if it's computed already + if StatsKeys.video_watermark_prob in sample[Fields.stats]: + return sample + + # there is no videos in this sample + if self.video_key not in sample or not sample[self.video_key]: + sample[Fields.stats][StatsKeys.video_watermark_prob] = np.array([], dtype=np.float64) + return sample + + # load videos + loaded_video_keys = sample[self.video_key] + sample, videos = load_data_with_context(sample, context, loaded_video_keys, load_video) + + watermark_probs = [] + # model, processor = get_model(self.model_key, rank, self.use_cuda()) + + for video_key, video in videos.items(): + sampled_frames_key = video_key + self.sampled_frames_key_suffix + + # extract frame images + if context and sampled_frames_key in sample[Fields.context]: + frames = sample[Fields.context][sampled_frames_key] + else: + if self.frame_sampling_method == "all_keyframes": + frames = extract_key_frames(video) + elif self.frame_sampling_method == "uniform": + frames = extract_video_frames_uniformly(video, self.frame_num) + else: + frames = [] + + # store the sampled frames in the context + if context: + sample[Fields.context][sampled_frames_key] = frames + + frame_images = [frame.to_image() for frame in frames] + + if len(frame_images) > 0: + inputs = processor(images=frame_images, return_tensors="pt") + inputs = inputs.to(model.device) + outputs = model(**inputs) + logits = outputs.logits + cur_probs = [probs[1] for probs in torch.softmax(logits, dim=-1)] + cur_probs = torch.Tensor(cur_probs) + + if self.reduce_mode == "avg": + cur_prob = cur_probs.mean() + elif self.reduce_mode == "max": + cur_prob = cur_probs.max() + else: + cur_prob = cur_probs.min() + else: + cur_prob = 0.0 + watermark_probs.append(float(cur_prob)) + + sample[Fields.stats][StatsKeys.video_watermark_prob] = watermark_probs + + if not context: + for vid_key in videos: + close_video(videos[vid_key]) + + return sample + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_watermark_prob in sample[Fields.stats]: diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 03ac7e6171..6bc1bccc4b 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -1,6 +1,7 @@ import math import re from itertools import chain +import time # 导入time模块 from pydantic import NonNegativeFloat, NonNegativeInt @@ -69,7 +70,7 @@ def __init__( if detector not in self.avaliable_detectors: raise ValueError( f"Scene detector {detector} is not supported. " - f"Can only be one of {list(self.avaliable_detectors.keys())}" + f"Can only be one of {list(self.avaliable_detectors.keys())}." 
)
 
         self.detector = detector
@@ -83,59 +84,74 @@ def __init__(
         self.detector_kwargs = {key: kwargs[key] for key in avaliable_kwargs if key in kwargs}
 
     def process_single(self, sample, context=False):
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.source_file] = []
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        output_video_keys = {}
-        scene_counts = {}
-
-        for video_key in loaded_video_keys:
-            # skip duplicate
-            if video_key in output_video_keys:
-                continue
-
-            redirected_video_key = transfer_filename(video_key, OP_NAME, **self._init_parameters)
-            output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER")
-
-            # detect scenes
-            detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs)
-            scene_list = scenedetect.detect(video_key, detector, show_progress=self.show_progress, start_in_scene=True)
-            scene_counts[video_key] = len(scene_list)
-
-            if len(scene_list) > 1:
-                # sync with split_video_ffmpeg internal
-                scene_num_format = f"%0{max(3, math.floor(math.log(len(scene_list), 10)) + 1)}d"  # noqa: E501
-                output_video_keys[video_key] = [
-                    output_template.replace("$SCENE_NUMBER", scene_num_format % (i + 1)) for i in range(len(scene_list))
-                ]
-                # split video into clips
-                scenedetect.split_video_ffmpeg(
-                    input_video_path=video_key,
-                    scene_list=scene_list,
-                    output_file_template=output_template,
-                    show_progress=self.show_progress,
+        # open log.txt to append processing logs
+        with open("log.txt", "a") as log_file:
+            # record the start time
+            start_time = time.time()
+
+            # there is no video in this sample
+            if self.video_key not in sample or not sample[self.video_key]:
+                sample[Fields.source_file] = []
+                log_file.write(f"[{time.ctime()}] No video found in sample.\n")
+                return sample
+
+            # load videos
+            loaded_video_keys = sample[self.video_key]
+            output_video_keys = {}
+            scene_counts = {}
+
+            for video_key in loaded_video_keys:
+                # skip duplicate
+                if video_key in output_video_keys:
+                    continue
+
+                redirected_video_key = transfer_filename(video_key, OP_NAME, **self._init_parameters)
+                output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER")
+
+                # detect scenes
+                detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs)
+                scene_list = scenedetect.detect(video_key, detector, show_progress=self.show_progress, start_in_scene=True)
+                scene_counts[video_key] = len(scene_list)
+
+                # log how long each video takes to process
+                if len(scene_list) > 1:
+                    scene_num_format = f"%0{max(3, math.floor(math.log(len(scene_list), 10)) + 1)}d"
+                    output_video_keys[video_key] = [
+                        output_template.replace("$SCENE_NUMBER", scene_num_format % (i + 1)) for i in range(len(scene_list))
+                    ]
+                    # split video into clips
+                    scenedetect.split_video_ffmpeg(
+                        input_video_path=video_key,
+                        scene_list=scene_list,
+                        output_file_template=output_template,
+                        show_progress=self.show_progress,
+                    )
+
+                    log_file.write(f"[{time.ctime()}] Video '{video_key}' processed, {len(scene_list)} scenes detected.\n")
+                else:
+                    output_video_keys[video_key] = [video_key]
+                    log_file.write(f"[{time.ctime()}] Video '{video_key}' processed, 1 scene detected.\n")
+
+            # replace split video tokens
+            if self.text_key in sample:
+                scene_counts_iter = iter([scene_counts[key] for key in loaded_video_keys])
+                updated_text = re.sub(
+                    re.escape(SpecialTokens.video),
+                    lambda match: replace_func(match, scene_counts_iter),
+                    sample[self.text_key],
                 )
-            else:
-                output_video_keys[video_key] = [video_key]
-
-        # replace split video tokens
-        if self.text_key in sample:
-            scene_counts_iter = iter([scene_counts[key] for key in loaded_video_keys])
-            updated_text = re.sub(
-                re.escape(SpecialTokens.video),
-                lambda match: replace_func(match, scene_counts_iter),
-                sample[self.text_key],
-            )
-            sample[self.text_key] = updated_text
+                sample[self.text_key] = updated_text
+
+            # when the file is modified, its source file needs to be updated.
+            sample[Fields.source_file] = []
+            for value in loaded_video_keys:
+                sample[Fields.source_file].extend([value] * len(output_video_keys[value]))
+
+            sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys]))
 
-        # when the file is modified, its source file needs to be updated.
-        sample[Fields.source_file] = []
-        for value in loaded_video_keys:
-            sample[Fields.source_file].extend([value] * len(output_video_keys[value]))
+            # record the end time and the elapsed time
+            end_time = time.time()
+            elapsed_time = end_time - start_time
+            log_file.write(f"[{time.ctime()}] Video processing for {', '.join(loaded_video_keys)} completed. Time taken: {elapsed_time:.2f} seconds.\n\n")
 
-        sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys]))
         return sample
 
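A note on the instrumentation introduced above: every worker that runs this op appends to the same relative log.txt, so records from parallel workers can interleave or land in different working directories. A minimal sketch of an equivalent timer that routes through the loguru logger already used in this repo (the timed helper is illustrative, not part of the patch):

    import time
    from contextlib import contextmanager

    from loguru import logger


    @contextmanager
    def timed(label):
        # Logs the wall-clock duration of the wrapped block; loguru sinks
        # serialize writes, so records from parallel workers stay intact.
        start = time.time()
        try:
            yield
        finally:
            logger.info(f"{label} took {time.time() - start:.2f}s")

Usage would be `with timed(f"scene split for {video_key}"): ...` wrapped around the detection and split calls.
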
diff --git a/demos/process_video_on_ray/configs/pr_demo.yaml b/demos/process_video_on_ray/configs/pr_demo.yaml
index ffde4c3e93..12e7256837 100644
--- a/demos/process_video_on_ray/configs/pr_demo.yaml
+++ b/demos/process_video_on_ray/configs/pr_demo.yaml
@@ -8,19 +8,21 @@ ray_address: 'auto'                                        # change to your ray cluster address, e.
 dataset:
   configs:
     - type: local
-      path: './demos/process_video_on_ray/data/demo-dataset.jsonl'  # path to your dataset directory or file
-export_path: './outputs/demo/process_video_on_ray/demo-dataset'
+      # path: './demos/process_video_on_ray/data/my-dataset_4.jsonl'  # path to your dataset directory or file
+      # path: './demos/process_video_on_ray/data/demo-dataset.jsonl'  # path to your dataset directory or file
+      path: './demos/process_video_on_ray/data/video_metadata.jsonl'
+export_path: './outputs/demo/process_video_on_ray/video_metadata'
 
 # process schedule
 # a list of several process operators with their arguments
 process:
   # Mapper ops
-  - video_split_by_scene_mapper:                                  # split videos into scene clips
-      detector: 'ContentDetector'                                 # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
-      threshold: 27.0                                             # threshold passed to the detector
-      min_scene_len: 10                                           # minimum length of any scene
-      show_progress: false                                        # whether to show progress from scenedetect
+  # - video_split_by_scene_mapper:                                  # split videos into scene clips
+  #     detector: 'ContentDetector'                                 # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
+  #     threshold: 27.0                                             # threshold passed to the detector
+  #     min_scene_len: 10                                           # minimum length of any scene
+  #     show_progress: false                                        # whether to show progress from scenedetect
 #  # Filter ops
   - video_aesthetics_filter:                                      # filter samples according to the aesthetics score of frame images extracted from videos.
       hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE  # Huggingface model name for the aesthetics predictor
@@ -31,6 +33,16 @@ process:
      reduce_mode: avg               # reduce mode applied to all the frames extracted from videos, must be one of ['avg','max', 'min']. 
any_or_all: any # keep this sample when any/all images meet the filter condition mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched + gpu_required: 0.5 + - video_watermark_filter: # filter samples according to the predicted watermark probabilities of videos in them + hf_watermark_model: amrul-hzz/watermark_detector # Huggingface model name for watermark classification + prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept. + frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes". + frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. + reduce_mode: avg # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min']. + any_or_all: any # keep this sample when any/all images meet the filter condition + mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched + gpu_required: 0.5 - video_captioning_from_frames_mapper: # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string. 
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # image-to-text model name on huggingface to generate caption caption_num: 1 # how many candidate captions to generate for each video From fc8968a591c65aeb97ab04865971edc75761bb67 Mon Sep 17 00:00:00 2001 From: xcy Date: Fri, 8 Aug 2025 15:36:55 +0800 Subject: [PATCH 06/16] update 0808 --- data_juicer/core/data/dj_dataset.py | 4 + data_juicer/core/data/ray_dataset.py | 434 +++++++++++++----- data_juicer/core/executor/ray_executor.py | 4 + data_juicer/core/ray_actor.py | 1 + data_juicer/ops/base_op.py | 1 + .../ops/filter/video_aesthetics_filter.py | 1 - .../ops/filter/video_watermark_filter.py | 1 - .../video_captioning_from_frames_mapper.py | 1 + .../ops/mapper/video_split_by_scene_mapper.py | 119 +++-- .../process_video_on_ray/configs/pr_demo.yaml | 18 +- 10 files changed, 385 insertions(+), 199 deletions(-) diff --git a/data_juicer/core/data/dj_dataset.py b/data_juicer/core/data/dj_dataset.py index b2d792e6b9..4ed99b8433 100644 --- a/data_juicer/core/data/dj_dataset.py +++ b/data_juicer/core/data/dj_dataset.py @@ -36,6 +36,10 @@ class DJDataset(ABC): def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: # TODO: add type hint """process a list of operators on the dataset.""" + @abstractmethod + def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: + """Implementing op parallel data processing based on Ray Actor""" + @abstractmethod def schema(self) -> Schema: """Get dataset schema. diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index 5a86e6902e..c50c3aa895 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -1,5 +1,7 @@ from __future__ import annotations +from collections import defaultdict +from datetime import datetime import os from functools import partial import queue @@ -165,6 +167,7 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) - self.data = self.data.materialize() return self + def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: if operators is None: return self @@ -219,9 +222,9 @@ def process_batch_arrow(table: pyarrow.Table): logger.info(f"Operator {op._name} has {len(actors[op._name])} actor(s).") - # Step 2: 设置每个 operator 的 batch size 为其 actor 数量 + # Step 2: 设置每个 operator 的 batch size batch_sizes = { - op._name: max(1, len(actors[op._name])) + op._name: op.batch_size if hasattr(op, 'batch_size') else 1 for op in operators } @@ -229,96 +232,198 @@ def process_batch_arrow(table: pyarrow.Table): # Step 3: 如果只有一个 operator,单独处理 if len(operators) == 1: - return self._process_single_operator(operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name]) + return self._process_single_operator_streaming(operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name]) + + # Step 4: 为每个actor创建独立的数据队列和终止计数器 + actor_queues = {} + termination_counters = {} + for op in operators: + actor_queues[op._name] = [] + termination_counters[op._name] = { + 'count': 0, + 'lock': threading.Lock(), + 'total': len(actors[op._name]) + } + for i, actor in enumerate(actors[op._name]): + actor_queues[op._name].append(queue.Queue(maxsize=50)) - # Step 4: 构建 op_buckets 和结果缓存 - op_buckets = {op._name: queue.Queue(maxsize=100) for op in operators[1:]} final_results = [] result_lock = threading.Lock() - # Step 5: 启动 consumer 线程(从第二个 operator 开始) + # Step 5: 为每个actor启动独立的处理线程 threads = [] - for 
idx, op in enumerate(operators[1:], start=1):
-            for actor in actors[op._name]:
+        for idx, op in enumerate(operators):
+            for i, actor in enumerate(actors[op._name]):
                 thread = threading.Thread(
-                    target=self._process_operator,
-                    args=(idx, op, actor, op_buckets, actors, operators, final_results, result_lock),
-                    name=f"processor_{op._name}_{actor._ray_actor_id.hex()[:6]}",
+                    target=self._process_actor_streaming,
+                    args=(idx, op, actor, i, actor_queues, actors, operators, final_results,
+                          result_lock, batch_sizes[op._name], termination_counters),
+                    name=f"actor_{op._name}_{i}",
                     daemon=True
                 )
                 thread.start()
                 threads.append(thread)
 
-        # Step 6: the first operator submits the data
-        first_op = operators[0]
-        first_op_actors = actors[first_op._name]
-        actor_index = 0
-
-        try:
-            for batch in self.data.iter_batches(
-                batch_size=batch_sizes[first_op._name],
-                batch_format="pyarrow"
-            ):
-                futures = []
-                for row_idx in range(len(batch)):
-                    row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
-                    actor = first_op_actors[actor_index % len(first_op_actors)]
-                    actor_index += 1
-                    futures.append(self._submit_to_actor(first_op, actor, row_data))
-
-            results = ray.get(futures)
-            for result in results:
-                if len(operators) > 1:
-                    op_buckets[operators[1]._name].put(result)
-                else:
-                    with result_lock:
-                        if isinstance(result, list):
-                            final_results.extend(result)
-                        elif result is not None:
-                            final_results.append(result)
+        # Step 6: data distributor thread - feed rows to the first operator's actors
+        def data_distributor():
+            first_op = operators[0]
+            first_op_queues = actor_queues[first_op._name]
+            actor_index = 0
+            row_counter = 0  # global row counter
+
+            try:
+                for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"):
+                    for row_idx in range(len(batch)):
+                        row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names}
+                        row_data['_row_id'] = row_counter  # attach the row id to the data
+                        row_counter += 1
+
+                        # round-robin across the actor queues
+                        target_queue = first_op_queues[actor_index % len(first_op_queues)]
+                        target_queue.put(row_data)
+                        actor_index += 1
+
+            except Exception as e:
+                logger.error(f"Error in data distributor: {e}")
+            finally:
+                # notify all actors of the first operator to stop
+                for actor_queue in first_op_queues:
+                    actor_queue.put(None)
 
-        except Exception as e:
-            logger.error(f"Error processing data: {e}")
-            raise
+        # start the data distributor thread
+        distributor_thread = threading.Thread(target=data_distributor, daemon=True)
+        distributor_thread.start()
 
-        # Step 7: signal the downstream to finish and wait for the threads
-        for _ in range(len(actors[operators[1]._name])):
-            op_buckets[operators[1]._name].put(None)
+        # wait for distribution to finish
+        distributor_thread.join()
 
+        # wait for all processing threads to finish
        for thread in threads:
            thread.join()
 
        if final_results:
+            # print("\nFinal Res:", final_results)
            self.data = from_items(final_results)
+
        return self
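+
+    # NOTE: each consumer thread defined below owns exactly one actor and one
+    # input queue; rows are buffered up to the op's batch size, flushed when a
+    # queue poll times out, and one None sentinel per upstream actor drives the
+    # termination counter so that end-of-stream is forwarded downstream exactly
+    # once per stage.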
== "end": + duration = (time.time() - start_time) + logger.info(f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | END | {timestamp} | Duration: {duration:.3f} s") + + while True: + try: + data_item = input_queue.get(timeout=30.0) + if data_item is None: + # 处理剩余的batch数据 + if batch_buffer: + self._process_and_forward_batch( + op, actor, batch_buffer, next_op_queues, final_results, + result_lock, next_actor_index, log_data_flow + ) + next_actor_index += len(batch_buffer) + + # 更新终止计数器 + with termination_counters[op_name]['lock']: + termination_counters[op_name]['count'] += 1 + current_count = termination_counters[op_name]['count'] + total_actors = termination_counters[op_name]['total'] + + # 只有当所有actor都收到None时才通知下游 + if current_count >= total_actors and next_op_queues: + for q in next_op_queues: + q.put(None) + + break + + # 获取行号,如果没有则使用"unknown" + row_id = data_item.get('_row_id', 'unknown') + start_time = time.time() + log_data_flow(row_id, "start", start_time) + + batch_buffer.append((data_item, start_time, row_id)) # 保存数据、开始时间和行号 + + # 当batch满了时处理,或者对于非批处理操作立即处理 + if len(batch_buffer) >= batch_size or not op.is_batched_op(): + results_count = self._process_and_forward_batch( + op, actor, batch_buffer, next_op_queues, final_results, + result_lock, next_actor_index, log_data_flow + ) + next_actor_index += results_count + processed_count += len(batch_buffer) + batch_buffer = [] + + except queue.Empty: + # 超时时处理已有的batch数据 + if batch_buffer: + results_count = self._process_and_forward_batch( + op, actor, batch_buffer, next_op_queues, final_results, + result_lock, next_actor_index, log_data_flow + ) + next_actor_index += results_count + processed_count += len(batch_buffer) + batch_buffer = [] + continue + except Exception as e: + logger.error(f"Error in {op_name} actor {actor_id}: {e}") + break + + logger.info(f"Streaming processor for {op_name} actor {actor_id} completed, processed {processed_count} items") - - def _process_single_operator(self, op, op_actors, batch_size): - final_results = [] - actor_index = 0 - - logger.info(f"Single operator {op._name} running with batch_size = {batch_size}") - - for batch in self.data.iter_batches(batch_size=batch_size, batch_format="pyarrow"): - futures = [] - for row_idx in range(len(batch)): - row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names} - actor = op_actors[actor_index % len(op_actors)] - actor_index += 1 - futures.append(self._submit_to_actor(op, actor, row_data)) - results = ray.get(futures) - for result in results: - if isinstance(result, list): - final_results.extend(result) - elif result is not None: - final_results.append(result) - - if final_results: - self.data = from_items(final_results) - - return self - + def _process_batch(self, op, actor, batch_data, final_results, result_lock): + """处理一个batch的数据""" + if not batch_data: + return + + try: + if len(batch_data) == 1: + # 单条数据处理 + future = self._submit_to_actor(op, actor, batch_data[0]) + results = ray.get(future) + else: + # 批量数据处理 + futures = [self._submit_to_actor(op, actor, item) for item in batch_data] + results = ray.get(futures) + # 展平结果 + flattened_results = [] + for result in results: + if isinstance(result, list): + flattened_results.extend(result) + elif result is not None: + flattened_results.append(result) + results = flattened_results + + # 保存最终结果 + with result_lock: + if isinstance(results, list): + final_results.extend(results) + elif results is not None: + final_results.append(results) + + except Exception as e: + logger.error(f"Error 
processing batch: {e}") def _submit_to_actor(self, op, actor, data_item): if isinstance(op, Mapper): @@ -334,66 +439,155 @@ def _submit_to_actor(self, op, actor, data_item): else: return actor.filter_cpu_batched.remote(data_item) if op.is_batched_op() else actor.filter_cpu_single.remote(data_item) - - def _process_operator(self, op_idx, op, actor, op_buckets, actors, operators, final_results, result_lock): - op_name = op._name - input_queue = op_buckets[op_name] - output_queue = op_buckets.get(operators[op_idx + 1]._name) if op_idx + 1 < len(operators) else None - - logger.info(f"Starting processor for {op_name} actor {actor._ray_actor_id.hex()[:6]}") + def _process_and_forward_batch(self, op, actor, batch_data_with_metadata, next_op_queues, + final_results, result_lock, next_actor_index, log_data_flow): + """处理batch数据并转发到下游,带数据流向跟踪""" + if not batch_data_with_metadata: + return 0 + + # 分离数据、开始时间和行号 + batch_data = [item[0] for item in batch_data_with_metadata] + start_times = [item[1] for item in batch_data_with_metadata] + row_ids = [item[2] for item in batch_data_with_metadata] + + try: + if len(batch_data) == 1: + # 单条数据处理 + future = self._submit_to_actor(op, actor, batch_data[0]) + results = ray.get(future) + else: + # 批量数据处理 + futures = [self._submit_to_actor(op, actor, item) for item in batch_data] + results = ray.get(futures) + # 展平结果 + flattened_results = [] + for result in results: + if isinstance(result, list): + flattened_results.extend(result) + elif result is not None: + flattened_results.append(result) + results = flattened_results + + # 处理结果 + valid_results = [] + if isinstance(op, Mapper): + if isinstance(results, list): + valid_results = results + elif results is not None: + valid_results = [results] + elif isinstance(op, Filter): + if results: # Filter返回True的数据 + if isinstance(results, list): + valid_results = results + else: + valid_results = [results] + + # 记录处理结束时间 + for row_id, start_time in zip(row_ids, start_times): + log_data_flow(row_id, "end", start_time) + + # 转发到下游或保存最终结果 + if next_op_queues and valid_results: + # 轮询分发到下游actor + for i, result in enumerate(valid_results): + try: + # 保持行号传递到下游 + if isinstance(result, dict): + result['_row_id'] = row_ids[i % len(row_ids)] + + target_queue_idx = (next_actor_index + i) % len(next_op_queues) + next_op_queues[target_queue_idx].put(result) + except Exception as e: + logger.error(f"Error forwarding result to downstream queue: {e}") + elif not next_op_queues and valid_results: + # 最后一个operator,保存最终结果 + with result_lock: + final_results.extend(valid_results) + + return len(valid_results) + + except Exception as e: + # 出错时也记录结束时间 + for row_id, start_time in zip(row_ids, start_times): + log_data_flow(row_id, "end", start_time) + logger.error(f"Error processing and forwarding batch: {e}") + return 0 + + def _process_single_operator_streaming(self, op, op_actors, batch_size): + """流式处理单个operator""" + final_results = [] + result_lock = threading.Lock() + + # 为每个actor创建独立队列 + actor_queues = [queue.Queue(maxsize=50) for _ in op_actors] + + # 启动actor处理线程 + threads = [] + for i, actor in enumerate(op_actors): + thread = threading.Thread( + target=self._process_single_actor, + args=(op, actor, actor_queues[i], final_results, result_lock, batch_size), + daemon=True + ) + thread.start() + threads.append(thread) + + # 分发数据 + actor_index = 0 + for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): + for row_idx in range(len(batch)): + row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names} + 
target_queue = actor_queues[actor_index % len(actor_queues)] + target_queue.put(row_data) + actor_index += 1 + + # 通知结束 + for actor_queue in actor_queues: + actor_queue.put(None) + + # 等待完成 + for thread in threads: + thread.join() + + if final_results: + self.data = from_items(final_results) + + return self + + def _process_single_actor(self, op, actor, input_queue, final_results, result_lock, batch_size): + """处理单个actor的数据""" + batch_buffer = [] processed_count = 0 - start_time = time.time() - + while True: try: data_item = input_queue.get(timeout=30.0) if data_item is None: - if output_queue: - for _ in range(len(actors[operators[op_idx + 1]._name])): - output_queue.put(None) + # 处理剩余的batch数据 + if batch_buffer: + self._process_batch(op, actor, batch_buffer, final_results, result_lock) break - - future = self._submit_to_actor(op, actor, data_item) - results = ray.get(future) - processed_count += 1 - - if isinstance(op, Mapper): - if output_queue: - output_queue.put(results) - else: - with result_lock: - if isinstance(results, list): - final_results.extend(results) - else: - final_results.append(results) - - elif isinstance(op, Filter): - if results: - if output_queue: - if isinstance(results, list): - for result in results: - output_queue.put(result) - else: - output_queue.put(results) - else: - with result_lock: - if isinstance(results, list): - final_results.extend(results) - else: - final_results.append(results) - - input_queue.task_done() - + + batch_buffer.append(data_item) + + # 当batch满了或者是批处理操作时处理 + if len(batch_buffer) >= batch_size or not op.is_batched_op(): + self._process_batch(op, actor, batch_buffer, final_results, result_lock) + batch_buffer = [] + processed_count += len(batch_buffer) if batch_buffer else 1 + except queue.Empty: - logger.warning(f"{op_name} actor {actor._ray_actor_id.hex()[:6]} queue timeout, processed {processed_count} items") + # 超时时处理已有的batch数据 + if batch_buffer: + self._process_batch(op, actor, batch_buffer, final_results, result_lock) + batch_buffer = [] continue except Exception as e: - logger.error(f"Error in {op_name} actor {actor._ray_actor_id.hex()[:6]}: {e}") - input_queue.task_done() + logger.error(f"Error in single actor processing: {e}") break + + logger.info(f"Single actor completed, processed {processed_count} items") - end_time = time.time() - logger.info(f"Processor for {op_name} actor {actor._ray_actor_id.hex()[:6]} completed in {end_time - start_time:.2f}s, processed {processed_count} items") def transform_to_2d_format(self, data): """ @@ -591,7 +785,7 @@ def to_list(self) -> list: return self.data.to_pandas().to_dict(orient="records") -class JSONStreamDatasource(ray.data.read_api.JSONDatasource): +class JSONStreamDatasource(ray.data.read_api.CSVDatasource): """ A temp Datasource for reading json stream. diff --git a/data_juicer/core/executor/ray_executor.py b/data_juicer/core/executor/ray_executor.py index dd95a3bbe1..d011c933ff 100644 --- a/data_juicer/core/executor/ray_executor.py +++ b/data_juicer/core/executor/ray_executor.py @@ -57,7 +57,9 @@ def __init__(self, cfg: Optional[Namespace] = None): # init ray logger.info("Initializing Ray ...") + ray.init(self.cfg.ray_address, ignore_reinit_error=True) + self.tmp_dir = os.path.join(self.work_dir, ".tmp", ray.get_runtime_context().get_job_id()) # absolute path resolution logic @@ -86,7 +88,9 @@ def run(self, load_data_np: Optional[PositiveInt] = None, skip_return=False): """ # 1. 
load data logger.info("Loading dataset with Ray...") + dstart = time.time() dataset = self.datasetbuilder.load_dataset(num_proc=load_data_np) + logger.info(f"Data loading in {time.time() - dstart:.3f}") columns = dataset.schema().columns # 2. extract processes diff --git a/data_juicer/core/ray_actor.py b/data_juicer/core/ray_actor.py index bdef9d86da..88b83e8ea0 100644 --- a/data_juicer/core/ray_actor.py +++ b/data_juicer/core/ray_actor.py @@ -50,6 +50,7 @@ def filter_cuda_single(self, data): self.load_model() data = self.op.compute_stats_single_actor(data, self.model, self.processor) keep = self.op.process_single(data) + if keep: return data else: diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 2d7db36571..fc4d9db4ec 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -292,6 +292,7 @@ def empty_history(self): return np.empty((0, 0), dtype=str) def load_model(self, rank=None): + start = time.time() start_time = datetime.fromtimestamp(start, pytz.utc).astimezone(beijing_tz) model, processor = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index d1ee055580..1412068425 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -20,7 +20,6 @@ OP_NAME = "video_aesthetics_filter" - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index afc3c0c622..4237757562 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -19,7 +19,6 @@ OP_NAME = "video_watermark_filter" - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index c96cdf5755..a71c7486e3 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -331,6 +331,7 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk + def process_batched_actor(self, samples, model, processor, rank=None, context=False): """ :param samples: diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 6bc1bccc4b..e11a4db5ab 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -84,74 +84,59 @@ def __init__( self.detector_kwargs = {key: kwargs[key] for key in avaliable_kwargs if key in kwargs} def process_single(self, sample, context=False): - # 打开log.txt文件进行写入日志 - with open("log.txt", "a") as log_file: - # 记录开始时间 - start_time = time.time() - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.source_file] = [] - log_file.write(f"[{time.ctime()}] No video found in sample.\n") - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - output_video_keys = {} - scene_counts = {} - - for video_key in loaded_video_keys: - # skip duplicate - if video_key in 
output_video_keys: - continue - - redirected_video_key = transfer_filename(video_key, OP_NAME, **self._init_parameters) - output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER") - - # detect scenes - detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs) - scene_list = scenedetect.detect(video_key, detector, show_progress=self.show_progress, start_in_scene=True) - scene_counts[video_key] = len(scene_list) - - # 记录视频处理的起始时间和结束时间 - if len(scene_list) > 1: - scene_num_format = f"%0{max(3, math.floor(math.log(len(scene_list), 10)) + 1)}d" - output_video_keys[video_key] = [ - output_template.replace("$SCENE_NUMBER", scene_num_format % (i + 1)) for i in range(len(scene_list)) - ] - # split video into clips - scenedetect.split_video_ffmpeg( - input_video_path=video_key, - scene_list=scene_list, - output_file_template=output_template, - show_progress=self.show_progress, - ) - - log_file.write(f"[{time.ctime()}] Video '{video_key}' processed, {len(scene_list)} scenes detected.\n") - else: - output_video_keys[video_key] = [video_key] - log_file.write(f"[{time.ctime()}] Video '{video_key}' processed, 1 scene detected.\n") - - # replace split video tokens - if self.text_key in sample: - scene_counts_iter = iter([scene_counts[key] for key in loaded_video_keys]) - updated_text = re.sub( - re.escape(SpecialTokens.video), - lambda match: replace_func(match, scene_counts_iter), - sample[self.text_key], - ) - sample[self.text_key] = updated_text - - # when the file is modified, its source file needs to be updated. + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] - for value in loaded_video_keys: - sample[Fields.source_file].extend([value] * len(output_video_keys[value])) - - sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys])) + return sample + + # load videos + loaded_video_keys = sample[self.video_key] + output_video_keys = {} + scene_counts = {} + + for video_key in loaded_video_keys: + # skip duplicate + if video_key in output_video_keys: + continue + + redirected_video_key = transfer_filename(video_key, OP_NAME, self.save_dir, **self._init_parameters) + output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER") + + # detect scenes + detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs) + scene_list = scenedetect.detect(video_key, detector, show_progress=self.show_progress, start_in_scene=True) + scene_counts[video_key] = len(scene_list) + + if len(scene_list) > 1: + # sync with split_video_ffmpeg internal + scene_num_format = f"%0{max(3, math.floor(math.log(len(scene_list), 10)) + 1)}d" # noqa: E501 + output_video_keys[video_key] = [ + output_template.replace("$SCENE_NUMBER", scene_num_format % (i + 1)) for i in range(len(scene_list)) + ] + # split video into clips + scenedetect.split_video_ffmpeg( + input_video_path=video_key, + scene_list=scene_list, + output_file_template=output_template, + show_progress=self.show_progress, + ) + else: + output_video_keys[video_key] = [video_key] + + # replace split video tokens + if self.text_key in sample: + scene_counts_iter = iter([scene_counts[key] for key in loaded_video_keys]) + updated_text = re.sub( + re.escape(SpecialTokens.video), + lambda match: replace_func(match, scene_counts_iter), + sample[self.text_key], + ) + sample[self.text_key] = updated_text - # 记录处理结束时间和耗时 - end_time = time.time() - 
elapsed_time = end_time - start_time - log_file.write(f"[{time.ctime()}] Video processing for {', '.join(loaded_video_keys)} completed. Time taken: {elapsed_time:.2f} seconds.\n\n") + # when the file is modified, its source file needs to be updated. + sample[Fields.source_file] = [] + for value in loaded_video_keys: + sample[Fields.source_file].extend([value] * len(output_video_keys[value])) + sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys])) return sample diff --git a/demos/process_video_on_ray/configs/pr_demo.yaml b/demos/process_video_on_ray/configs/pr_demo.yaml index 12e7256837..b90b113f7b 100644 --- a/demos/process_video_on_ray/configs/pr_demo.yaml +++ b/demos/process_video_on_ray/configs/pr_demo.yaml @@ -8,21 +8,19 @@ ray_address: 'auto' # change to your ray cluster address, e. dataset: configs: - type: local - # path: './demos/process_video_on_ray/data/my-dataset_4.jsonl' # path to your dataset directory or file - # path: './demos/process_video_on_ray/data/demo-dataset.jsonl' # path to your dataset directory or file - path: './demos/process_video_on_ray/data/video_metadata.jsonl' -export_path: './outputs/demo/process_video_on_ray/video_metadata' + path: './demos/process_video_on_ray/data/demo-dataset.jsonl' # path to your dataset directory or file +export_path: './outputs/demo/process_video_on_ray/demo-dataset' # process schedule # a list of several process operators with their arguments process: # Mapper ops - # - video_split_by_scene_mapper: # split videos into scene clips - # detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector`] - # threshold: 27.0 # threshold passed to the detector - # min_scene_len: 10 # minimum length of any scene - # show_progress: false # whether to show progress from scenedetect + - video_split_by_scene_mapper: # split videos into scene clips + detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector`] + threshold: 27.0 # threshold passed to the detector + min_scene_len: 10 # minimum length of any scene + show_progress: false # whether to show progress from scenedetect # # Filter ops - video_aesthetics_filter: # filter samples according to the aesthetics score of frame images extracted from videos. hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor @@ -55,4 +53,4 @@ process: horizontal_flip: false # flip frame image horizontally (left to right). vertical_flip: false # flip frame image vertically (top to bottom). 
mem_required: '20GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched - \ No newline at end of file + gpu_required: 1 \ No newline at end of file From 95de6e3c2e2d16001b5303e4a45fa0d658505b5b Mon Sep 17 00:00:00 2001 From: xcy Date: Tue, 19 Aug 2025 14:48:16 +0800 Subject: [PATCH 07/16] update for code review --- data_juicer/core/data/ray_dataset.py | 12 +++++++----- data_juicer/core/ray_actor.py | 4 ++-- .../ops/mapper/video_split_by_scene_mapper.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index c50c3aa895..22f32bdd07 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -205,7 +205,8 @@ def process_batch_arrow(table: pyarrow.Table): actors = {} for op in operators: op_proc = 1 if op.use_cuda() else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda()) - actor_num = min(op_proc, self.data.count()) + # actor_num = min(op_proc, self.data.count()) + actor_num = op_proc actors[op._name] = [] for _ in range(actor_num): @@ -339,11 +340,11 @@ def log_data_flow(row_id, action, start_time=None): if data_item is None: # 处理剩余的batch数据 if batch_buffer: - self._process_and_forward_batch( + results_count = self._process_and_forward_batch( op, actor, batch_buffer, next_op_queues, final_results, result_lock, next_actor_index, log_data_flow ) - next_actor_index += len(batch_buffer) + next_actor_index += results_count # 更新终止计数器 with termination_counters[op_name]['lock']: @@ -572,9 +573,10 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo # 当batch满了或者是批处理操作时处理 if len(batch_buffer) >= batch_size or not op.is_batched_op(): + processed_batch_len = len(batch_buffer) self._process_batch(op, actor, batch_buffer, final_results, result_lock) batch_buffer = [] - processed_count += len(batch_buffer) if batch_buffer else 1 + processed_count += processed_batch_len except queue.Empty: # 超时时处理已有的batch数据 @@ -785,7 +787,7 @@ def to_list(self) -> list: return self.data.to_pandas().to_dict(orient="records") -class JSONStreamDatasource(ray.data.read_api.CSVDatasource): +class JSONStreamDatasource(ray.data.read_api.JSONDatasource): """ A temp Datasource for reading json stream. 
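The @@ -572,9 +573,10 @@ hunk above fixes a counting bug: `len(batch_buffer)` was read after the buffer had already been reset, so the `else 1` fallback always fired and the processed count drifted. The general shape of the fix, as a minimal standalone sketch (the names here are illustrative):

    def drain(buffer, process):
        # Capture the size before the buffer is emptied; reading it after
        # the reset would always contribute zero (or a bogus fallback of 1)
        # to the processed count.
        n = len(buffer)
        process(list(buffer))
        buffer.clear()
        return n
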
diff --git a/data_juicer/core/ray_actor.py b/data_juicer/core/ray_actor.py index 88b83e8ea0..9eec2dd797 100644 --- a/data_juicer/core/ray_actor.py +++ b/data_juicer/core/ray_actor.py @@ -59,8 +59,8 @@ def filter_cuda_single(self, data): def filter_cuda_batched(self, data): if not self._model_loaded: self.load_model() - # data = self.op.compute_stats_batched(data, self.model, self.processor) - data = self.op.compute_stats_batched(data) + data = self.op.compute_stats_batched(data, self.model, self.processor) + keep_mask = list(self.op.process_batched(data)) # 将map对象转换为列表 # 如果没有数据需要保留,返回None diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index e11a4db5ab..a2cee3d5e2 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -99,7 +99,7 @@ def process_single(self, sample, context=False): if video_key in output_video_keys: continue - redirected_video_key = transfer_filename(video_key, OP_NAME, self.save_dir, **self._init_parameters) + redirected_video_key = transfer_filename(video_key, OP_NAME, self.work_dir, **self._init_parameters) output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER") # detect scenes From da962a57bd531bcd4af35ebb4c8b293665ba2ac0 Mon Sep 17 00:00:00 2001 From: xcy Date: Tue, 19 Aug 2025 15:02:24 +0800 Subject: [PATCH 08/16] update for conflicts --- data_juicer/ops/base_op.py | 5 +++-- data_juicer/ops/mapper/video_split_by_scene_mapper.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index fc4d9db4ec..97370d7f40 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -8,8 +8,9 @@ from data_juicer import is_cuda_available from data_juicer.utils.constant import Fields -from data_juicer.utils.mm_utils import size_to_bytes -from data_juicer.utils.model_utils import free_models, get_model +from data_juicer.utils.mm_utils import SpecialTokens, size_to_bytes +from data_juicer.utils.model_utils import free_models +from data_juicer.utils.model_utils import get_model from data_juicer.utils.process_utils import calculate_np from data_juicer.utils.registry import Registry diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index a2cee3d5e2..e11a4db5ab 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -99,7 +99,7 @@ def process_single(self, sample, context=False): if video_key in output_video_keys: continue - redirected_video_key = transfer_filename(video_key, OP_NAME, self.work_dir, **self._init_parameters) + redirected_video_key = transfer_filename(video_key, OP_NAME, self.save_dir, **self._init_parameters) output_template = add_suffix_to_filename(redirected_video_key, "_$SCENE_NUMBER") # detect scenes From 8cb420e52327882996270a1188df7ce9b55a8e95 Mon Sep 17 00:00:00 2001 From: xcy Date: Tue, 19 Aug 2025 15:03:32 +0800 Subject: [PATCH 09/16] update for conflicts2 --- data_juicer/ops/base_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 97370d7f40..c2a0b9dea2 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -8,7 +8,7 @@ from data_juicer import is_cuda_available from data_juicer.utils.constant import Fields -from data_juicer.utils.mm_utils import SpecialTokens, 
size_to_bytes +from data_juicer.utils.mm_utils import SpecialTokens, size_to_bytes from data_juicer.utils.model_utils import free_models from data_juicer.utils.model_utils import get_model from data_juicer.utils.process_utils import calculate_np From aa258209c0652617ecadd192a769d7b05ce45152 Mon Sep 17 00:00:00 2001 From: xcy Date: Wed, 20 Aug 2025 13:44:22 +0800 Subject: [PATCH 10/16] update for pre-commit --- data_juicer/core/RayOperatorWrapper.py | 104 ++++++++ data_juicer/core/data/ray_dataset.py | 251 ++++++++++-------- data_juicer/core/executor/ray_executor.py | 1 + .../ops/filter/video_aesthetics_filter.py | 4 + .../video_captioning_from_frames_mapper.py | 12 +- 5 files changed, 253 insertions(+), 119 deletions(-) create mode 100644 data_juicer/core/RayOperatorWrapper.py diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py new file mode 100644 index 0000000000..7d8fd30d42 --- /dev/null +++ b/data_juicer/core/RayOperatorWrapper.py @@ -0,0 +1,104 @@ +import pyarrow +import ray + +@ray.remote(num_gpus=0.0) +class Actor: + def __init__(self, op, rank=None): + + self.op = op + self._model_loaded = False # taggle to check if model is loaded + self.rank = rank + self.model = None + self.processor = None + + def load_model(self): + + if self.op.use_cuda() and not self._model_loaded: + + self.model, self.processor = self.op.load_model(rank=self.rank) + self._model_loaded = True + + def mapper_cuda(self, data): + if not self._model_loaded: + self.load_model() # ensure model is loaded before processing + # process data + data = self.op.process_single_actor(data, self.model, self.processor) + return data + + def mapper_cuda_batched(self, data): + if not self._model_loaded: + self.load_model() # ensure model is loaded before processing + # process data + data = self.op.process_batched_actor(data, self.model, self.processor) + return data + + def mapper_cpu(self, data): + # process data + processed_data = self.op.process_single(data) + return processed_data + + def filter_cuda_single(self, data): + if not self._model_loaded: + self.load_model() + # Call the Filter operator function + data = self.op.compute_stats_single_actor(data, self.model, self.processor) + keep = self.op.process_single(data) + + if keep: + return data + else: + return None + + def filter_cuda_batched(self, data): + if not self._model_loaded: + self.load_model() + data = self.op.compute_stats_batched(data, self.model, self.processor) + # transform the map object to a list + keep_mask = list(self.op.process_batched(data)) + + if not any(keep_mask): + return None + + # filter data based on the keep_mask + if isinstance(data, dict): + filtered_data = { + key: [value for value, keep in zip(values, keep_mask) if keep] for key, values in data.items() + } + elif isinstance(data, list): + filtered_data = [item for item, keep in zip(data, keep_mask) if keep] + else: + raise ValueError("Unsupported data type for batch filtering") + + return filtered_data + + + def filter_cpu_single(self, data): + data = self.op.compute_stats_single(data) + keep = self.op.process_single(data) + if keep: + return data + else: + return None + + def filter_cpu_batched(self, data): + data = self.op.compute_stats_batched(data) + # transform the map object to a list + keep_mask = list(self.op.process_batched(data)) + + if not any(keep_mask): + return None + + # filter data based on the keep_mask + if isinstance(data, dict): + filtered_data = { + key: [value for value, keep in zip(values, keep_mask) if keep] for key, 
values in data.items() + } + elif isinstance(data, list): + + filtered_data = [item for item, keep in zip(data, keep_mask) if keep] + else: + raise ValueError("Unsupported data type for batch filtering") + + return filtered_data + + diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index 22f32bdd07..15c2cc0436 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -9,8 +9,7 @@ import time from typing import Any, Dict, List, Literal, Optional, Union import uuid -import numpy -from data_juicer.core.ray_actor import Actor +from data_juicer.core.RayOperatorWrapper import Actor import pyarrow from jsonargparse import Namespace from loguru import logger @@ -169,12 +168,16 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) - def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: + """ + Process the dataset in parallel using multiple operators. + This method creates actors for each operator and processes the dataset in a streaming manner. + """ if operators is None: return self if not isinstance(operators, list): operators = [operators] - # 添加meta和stats列(如果需要) + # Add meta and stats columns if needed add_meta = False add_stats = False for op in operators: @@ -201,7 +204,7 @@ def process_batch_arrow(table: pyarrow.Table): self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow") - # Step 1: 创建所有 operator 的 actor + # Step 1: Create actors for all operators actors = {} for op in operators: op_proc = 1 if op.use_cuda() else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda()) @@ -223,19 +226,16 @@ def process_batch_arrow(table: pyarrow.Table): logger.info(f"Operator {op._name} has {len(actors[op._name])} actor(s).") - # Step 2: 设置每个 operator 的 batch size - batch_sizes = { - op._name: op.batch_size if hasattr(op, 'batch_size') else 1 - for op in operators - } + # Step 2: Set batch size for each operator + batch_sizes = {op._name: op.batch_size if hasattr(op, "batch_size") else 1 for op in operators} logger.info(f"Batch sizes per operator: {batch_sizes}") - # Step 3: 如果只有一个 operator,单独处理 + # Step 3: Process single operator streaming if len(operators) == 1: return self._process_single_operator_streaming(operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name]) - # Step 4: 为每个actor创建独立的数据队列和终止计数器 + # Step 4: Create queues for each actor and termination counters actor_queues = {} termination_counters = {} for op in operators: @@ -251,7 +251,7 @@ def process_batch_arrow(table: pyarrow.Table): final_results = [] result_lock = threading.Lock() - # Step 5: 为每个actor启动独立的处理线程 + # Step 5: Start processing threads for each operator's actors threads = [] for idx, op in enumerate(operators): for i, actor in enumerate(actors[op._name]): @@ -265,21 +265,21 @@ def process_batch_arrow(table: pyarrow.Table): thread.start() threads.append(thread) - # Step 6: 数据分发线程 - 将数据分发给第一个operator的actors + # Step 6: Data distributor function to distribute data to actors def data_distributor(): first_op = operators[0] first_op_queues = actor_queues[first_op._name] actor_index = 0 - row_counter = 0 # 全局行号计数器 + row_counter = 0 # Initialize row counter try: for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): for row_idx in range(len(batch)): row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names} - row_data['_row_id'] = row_counter # 添加行号到数据中 + row_data["_row_id"] = 
row_counter row_counter += 1 - # 轮询分发给不同的actor队列 + # distribute data to actors in a round-robin manner target_queue = first_op_queues[actor_index % len(first_op_queues)] target_queue.put(row_data) actor_index += 1 @@ -287,30 +287,38 @@ def data_distributor(): except Exception as e: logger.error(f"Error in data distributor: {e}") finally: - # 通知所有第一个operator的actor结束 for actor_queue in first_op_queues: actor_queue.put(None) - # 启动数据分发线程 + # start data distributor thread distributor_thread = threading.Thread(target=data_distributor, daemon=True) distributor_thread.start() - # 等待分发完成 distributor_thread.join() - # 等待所有处理线程完成 + # wait for all processing threads to finish for thread in threads: thread.join() if final_results: - # print("\nFinal Res:", final_results) - self.data = from_items(final_results) + self.data = ray.data.from_items(final_results) return self - def _process_actor_streaming(self, op_idx, op, actor, actor_id, actor_queues, actors, operators, - final_results, result_lock, batch_size, termination_counters): - """流式处理actor数据,带数据流向跟踪""" + def _process_actor_streaming( + self, + op_idx, + op, + actor, + actor_id, + actor_queues, + operators, + final_results, + result_lock, + batch_size, + termination_counters, + ): + """Process data for a single operator actor in a streaming manner.""" op_name = op._name input_queue = actor_queues[op_name][actor_id] @@ -325,7 +333,7 @@ def _process_actor_streaming(self, op_idx, op, actor, actor_id, actor_queues, ac batch_buffer = [] next_actor_index = 0 - # 数据流向跟踪函数 + # data flow logging function def log_data_flow(row_id, action, start_time=None): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] if action == "start": @@ -333,40 +341,67 @@ def log_data_flow(row_id, action, start_time=None): elif action == "end": duration = (time.time() - start_time) logger.info(f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | END | {timestamp} | Duration: {duration:.3f} s") - + + def flush_buffer(current_next_actor_index): + """ + A helper that processes and then clears the buffer. + It updates the enclosing processed_count and returns the number of results successfully forwarded. + """ + if not batch_buffer: + return 0 + + nonlocal processed_count + items_in_buffer = len(batch_buffer) + + results_count = self._process_and_forward_batch( + op, actor, batch_buffer, next_op_queues, final_results, + result_lock, current_next_actor_index, log_data_flow + ) + + # key fix: the processed count should increase regardless of the processing result + processed_count += items_in_buffer + batch_buffer.clear() # clear the buffer + + return results_count + while True: try: - data_item = input_queue.get(timeout=30.0) + # use a shorter timeout so that timeout-triggered batches are handled more frequently + data_item = input_queue.get(timeout=5.0) + if data_item is None: - # 处理剩余的batch数据 if batch_buffer: results_count = self._process_and_forward_batch( - op, actor, batch_buffer, next_op_queues, final_results, - result_lock, next_actor_index, log_data_flow + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + next_actor_index, log_data_flow, ) next_actor_index += results_count - # 更新终止计数器 - with termination_counters[op_name]['lock']: - termination_counters[op_name]['count'] += 1 - current_count = termination_counters[op_name]['count'] - total_actors = termination_counters[op_name]['total'] - - # 只有当所有actor都收到None时才通知下游 + # update termination counter + with termination_counters[op_name]["lock"]: + termination_counters[op_name]["count"] += 1 + current_count = termination_counters[op_name]["count"] + total_actors = termination_counters[op_name]["total"] + + # notify the downstream only when all actors have received None if current_count >=
total_actors and next_op_queues: for q in next_op_queues: q.put(None) break - - # 获取行号,如果没有则使用"unknown" - row_id = data_item.get('_row_id', 'unknown') + + row_id = data_item.get("_row_id", "unknown") start_time = time.time() log_data_flow(row_id, "start", start_time) - batch_buffer.append((data_item, start_time, row_id)) # 保存数据、开始时间和行号 - - # 当batch满了时处理,或者对于非批处理操作立即处理 + batch_buffer.append((data_item, start_time, row_id)) + if len(batch_buffer) >= batch_size or not op.is_batched_op(): results_count = self._process_and_forward_batch( op, actor, batch_buffer, next_op_queues, final_results, @@ -377,11 +412,16 @@ def log_data_flow(row_id, action, start_time=None): batch_buffer = [] except queue.Empty: - # 超时时处理已有的batch数据 if batch_buffer: results_count = self._process_and_forward_batch( - op, actor, batch_buffer, next_op_queues, final_results, - result_lock, next_actor_index, log_data_flow + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + next_actor_index, + log_data_flow, ) next_actor_index += results_count processed_count += len(batch_buffer) @@ -394,20 +434,18 @@ def log_data_flow(row_id, action, start_time=None): logger.info(f"Streaming processor for {op_name} actor {actor_id} completed, processed {processed_count} items") def _process_batch(self, op, actor, batch_data, final_results, result_lock): - """处理一个batch的数据""" + """Process a batch of data with the given operator and actor.""" if not batch_data: return try: if len(batch_data) == 1: - # 单条数据处理 future = self._submit_to_actor(op, actor, batch_data[0]) results = ray.get(future) else: - # 批量数据处理 futures = [self._submit_to_actor(op, actor, item) for item in batch_data] results = ray.get(futures) - # 展平结果 + # flatten results flattened_results = [] for result in results: if isinstance(result, list): @@ -415,8 +453,7 @@ def _process_batch(self, op, actor, batch_data, final_results, result_lock): elif result is not None: flattened_results.append(result) results = flattened_results - - # 保存最终结果 + with result_lock: if isinstance(results, list): final_results.extend(results) @@ -436,31 +473,46 @@ def _submit_to_actor(self, op, actor, data_item): elif isinstance(op, Filter): if op.use_cuda(): - return actor.filter_cuda_batched.remote(data_item) if op.is_batched_op() else actor.filter_cuda_single.remote(data_item) + return ( + actor.filter_cuda_batched.remote(data_item) + if op.is_batched_op() + else actor.filter_cuda_single.remote(data_item) + ) else: - return actor.filter_cpu_batched.remote(data_item) if op.is_batched_op() else actor.filter_cpu_single.remote(data_item) + return ( + actor.filter_cpu_batched.remote(data_item) + if op.is_batched_op() + else actor.filter_cpu_single.remote(data_item) + ) - def _process_and_forward_batch(self, op, actor, batch_data_with_metadata, next_op_queues, - final_results, result_lock, next_actor_index, log_data_flow): - """处理batch数据并转发到下游,带数据流向跟踪""" + def _process_and_forward_batch( + self, + op, + actor, + batch_data_with_metadata, + next_op_queues, + final_results, + result_lock, + next_actor_index, + log_data_flow, + ): + """Process batch data and forward to downstream with data flow tracking""" if not batch_data_with_metadata: return 0 - # 分离数据、开始时间和行号 + # separate the data, start time, and line number batch_data = [item[0] for item in batch_data_with_metadata] start_times = [item[1] for item in batch_data_with_metadata] row_ids = [item[2] for item in batch_data_with_metadata] try: if len(batch_data) == 1: - # 单条数据处理 future = self._submit_to_actor(op, actor, batch_data[0]) 
results = ray.get(future) else: - # 批量数据处理 futures = [self._submit_to_actor(op, actor, item) for item in batch_data] results = ray.get(futures) - # 展平结果 + # flatten results flattened_results = [] for result in results: if isinstance(result, list): @@ -469,7 +521,6 @@ def _process_and_forward_batch(self, op, actor, batch_data_with_metadata, next_o flattened_results.append(result) results = flattened_results - # 处理结果 valid_results = [] if isinstance(op, Mapper): if isinstance(results, list): @@ -477,22 +528,20 @@ def _process_and_forward_batch(self, op, actor, batch_data_with_metadata, next_o elif results is not None: valid_results = [results] elif isinstance(op, Filter): - if results: # Filter返回True的数据 + if results: if isinstance(results, list): valid_results = results else: valid_results = [results] - # 记录处理结束时间 for row_id, start_time in zip(row_ids, start_times): log_data_flow(row_id, "end", start_time) - - # 转发到下游或保存最终结果 + if next_op_queues and valid_results: - # 轮询分发到下游actor + # distribute results to next operator queues for i, result in enumerate(valid_results): try: - # 保持行号传递到下游 + # add row_id to result if needed if isinstance(result, dict): result['_row_id'] = row_ids[i % len(row_ids)] @@ -501,28 +550,26 @@ def _process_and_forward_batch(self, op, actor, batch_data_with_metadata, next_o except Exception as e: logger.error(f"Error forwarding result to downstream queue: {e}") elif not next_op_queues and valid_results: - # 最后一个operator,保存最终结果 with result_lock: final_results.extend(valid_results) return len(valid_results) except Exception as e: - # 出错时也记录结束时间 for row_id, start_time in zip(row_ids, start_times): log_data_flow(row_id, "end", start_time) logger.error(f"Error processing and forwarding batch: {e}") return 0 def _process_single_operator_streaming(self, op, op_actors, batch_size): - """流式处理单个operator""" + """Stream processing for a single operator.""" final_results = [] result_lock = threading.Lock() - # 为每个actor创建独立队列 + # create an independent queue for each actor. actor_queues = [queue.Queue(maxsize=50) for _ in op_actors] - - # 启动actor处理线程 + + # start the actor processing threads threads = [] for i, actor in enumerate(op_actors): thread = threading.Thread( @@ -532,8 +579,7 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): ) thread.start() threads.append(thread) - - # 分发数据 + actor_index = 0 for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): for row_idx in range(len(batch)): @@ -542,11 +588,10 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): target_queue.put(row_data) actor_index += 1 - # 通知结束 + # notify all Actors to terminate. for actor_queue in actor_queues: actor_queue.put(None) - # 等待完成 for thread in threads: thread.join() @@ -556,7 +601,10 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): return self def _process_single_actor(self, op, actor, input_queue, final_results, result_lock, batch_size): - """处理单个actor的数据""" + """ + Process data for a single actor in a streaming manner. + This function continuously reads data from the input queue, processes it, and stores the results. 
+ """ batch_buffer = [] processed_count = 0 @@ -564,14 +612,13 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo try: data_item = input_queue.get(timeout=30.0) if data_item is None: - # 处理剩余的batch数据 if batch_buffer: self._process_batch(op, actor, batch_buffer, final_results, result_lock) break batch_buffer.append(data_item) - # 当batch满了或者是批处理操作时处理 + # process when the batch is full or during batch processing operations. if len(batch_buffer) >= batch_size or not op.is_batched_op(): processed_batch_len = len(batch_buffer) self._process_batch(op, actor, batch_buffer, final_results, result_lock) @@ -579,7 +626,6 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo processed_count += processed_batch_len except queue.Empty: - # 超时时处理已有的batch数据 if batch_buffer: self._process_batch(op, actor, batch_buffer, final_results, result_lock) batch_buffer = [] @@ -593,47 +639,38 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo def transform_to_2d_format(self, data): """ - 将第二种格式的数据转换为第一种嵌套格式 - 根据 __dj__source_file__ 的唯一值来分组所有字段 + Transform data to 2D format for processing. """ - # print("data before trans", data) - if '__dj__source_file__' not in data: - if 'videos' not in data: - raise ValueError("数据中缺少 '__dj__source_file__' 字段且无法从 'videos' 字段推断") - # print(data) - data['__dj__source_file__'] = data['videos'] + if "__dj__source_file__" not in data: + if "videos" not in data: + raise ValueError("The '__dj__source_file__' field is missing in the data and cannot be inferred from the 'videos' field.") + data["__dj__source_file__"] = data["videos"] + + source_files = data["__dj__source_file__"] - - source_files = data['__dj__source_file__'] - - # 获取唯一的源文件并保持顺序 unique_sources = list(dict.fromkeys(source_files)) - # 为每个唯一源文件创建索引映射 source_to_indices = {} for source in unique_sources: source_to_indices[source] = [i for i, s in enumerate(source_files) if s == source] - # 初始化转换后的数据结构 transformed_data = {} - - # 遍历原数据的所有字段 for field_name, field_value in data.items(): - if field_name == '__dj__source_file__': - # 特殊处理 __dj__source_file__ 字段 + if field_name == "__dj__source_file__": + transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] transformed_data[field_name].append([source] * len(indices)) elif isinstance(field_value, list): - # 处理列表类型的字段 + transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] group_data = [field_value[i] for i in indices] transformed_data[field_name].append(group_data) elif isinstance(field_value, dict): - # 处理字典类型的字段 + transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] @@ -642,24 +679,20 @@ def transform_to_2d_format(self, data): if isinstance(values, list): group_dict[key] = [values[i] for i in indices] else: - # 如果值不是列表,则重复该值 group_dict[key] = [values] * len(indices) transformed_data[field_name].append(group_dict) elif isinstance(field_value, str): - # 处理字符串类型的字段 transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] - # 对于字符串,为每个组重复该字符串 transformed_data[field_name].append(field_value) else: - # 处理其他类型的字段 + transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] - # 为每个组重复该值 transformed_data[field_name].append(field_value) - # print("data after trans", transformed_data) + return transformed_data diff --git a/data_juicer/core/executor/ray_executor.py 
b/data_juicer/core/executor/ray_executor.py index d011c933ff..b1e4ae2e3d 100644 --- a/data_juicer/core/executor/ray_executor.py +++ b/data_juicer/core/executor/ray_executor.py @@ -75,6 +75,7 @@ def __init__(self, cfg: Optional[Namespace] = None): keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds, **self.cfg.export_extra_args, ) + # Process data with parallel operators self.op_enable_parallel = True # self.op_enable_parallel = False diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 1412068425..5a9827a544 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -116,6 +116,10 @@ def __init__( ) def compute_stats_single_actor(self, sample, model, processor, rank=None, context=False): + """ + Compute aesthetics scores for a single sample in the actor. + With the model and processor loaded when the actor was created. + """ # check if it's computed already if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: return sample diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index a71c7486e3..9c6c05d3d1 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -334,16 +334,8 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): def process_batched_actor(self, samples, model, processor, rank=None, context=False): """ - :param samples: - :return: - - Note: - This is a batched_OP, whose the input and output type are - both list. Suppose there are $N$ input sample list with batch - size as $b$, and denote caption_num as $M$. - the number of total samples after generation is $2Nb$ - for 'random_any' and 'similar_one' mode, - and $(1+M)Nb$ for 'all' mode. + Process a batch of samples in the actor. + With the model and processor loaded when the actor was created. 
""" # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] From 95ca9766c8bb66d18cccb7e90c2929520faac278 Mon Sep 17 00:00:00 2001 From: xcy Date: Wed, 20 Aug 2025 16:16:32 +0800 Subject: [PATCH 11/16] update for pre-commit --- data_juicer/core/RayOperatorWrapper.py | 1 - data_juicer/core/data/ray_dataset.py | 165 ++++++++++-------- data_juicer/ops/base_op.py | 13 +- .../ops/mapper/video_split_by_scene_mapper.py | 11 +- data_juicer/utils/file_utils.py | 32 +++- 5 files changed, 132 insertions(+), 90 deletions(-) diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py index 7d8fd30d42..c0321c9983 100644 --- a/data_juicer/core/RayOperatorWrapper.py +++ b/data_juicer/core/RayOperatorWrapper.py @@ -1,4 +1,3 @@ -import pyarrow import ray @ray.remote(num_gpus=0.0) diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index 15c2cc0436..feefa01800 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import defaultdict from datetime import datetime import os from functools import partial @@ -24,15 +23,8 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.process_utils import calculate_np from data_juicer.utils.webdataset_utils import _custom_default_decoder -import ray + ray = LazyLoader("ray") -from ray.util.placement_group import ( - placement_group, - placement_group_table, - remove_placement_group, -) -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray.data import from_items def get_abs_path(path, dataset_dir): if is_remote_path(path): @@ -207,7 +199,11 @@ def process_batch_arrow(table: pyarrow.Table): # Step 1: Create actors for all operators actors = {} for op in operators: - op_proc = 1 if op.use_cuda() else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda()) + op_proc = ( + 1 + if op.use_cuda() + else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda()) + ) # actor_num = min(op_proc, self.data.count()) actor_num = op_proc actors[op._name] = [] @@ -216,7 +212,7 @@ def process_batch_arrow(table: pyarrow.Table): actor = Actor.options( name=f"actor_{op._name}_{uuid.uuid4().hex[:4]}", num_gpus=op.gpu_required if op.use_cuda() else 0, - num_cpus=op.cpu_required + num_cpus=op.cpu_required, ).remote(op) if op.use_cuda(): @@ -233,18 +229,16 @@ def process_batch_arrow(table: pyarrow.Table): # Step 3: Process single operator streaming if len(operators) == 1: - return self._process_single_operator_streaming(operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name]) + return self._process_single_operator_streaming( + operators[0], actors[operators[0]._name], batch_sizes[operators[0]._name] + ) # Step 4: Create queues for each actor and termination counters actor_queues = {} termination_counters = {} for op in operators: actor_queues[op._name] = [] - termination_counters[op._name] = { - 'count': 0, - 'lock': threading.Lock(), - 'total': len(actors[op._name]) - } + termination_counters[op._name] = {"count": 0, "lock": threading.Lock(), "total": len(actors[op._name])} for i, actor in enumerate(actors[op._name]): actor_queues[op._name].append(queue.Queue(maxsize=50)) @@ -257,10 +251,20 @@ def process_batch_arrow(table: pyarrow.Table): for i, actor in enumerate(actors[op._name]): thread = threading.Thread( target=self._process_actor_streaming, - 
args=(idx, op, actor, i, actor_queues, actors, operators, final_results, - result_lock, batch_sizes[op._name], termination_counters), + args=( + idx, + op, + actor, + i, + actor_queues, + operators, + final_results, + result_lock, + batch_sizes[op._name], + termination_counters, + ), name=f"actor_{op._name}_{i}", - daemon=True + daemon=True, ) thread.start() threads.append(thread) @@ -276,7 +280,7 @@ def data_distributor(): for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): for row_idx in range(len(batch)): row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names} - row_data["_row_id"] = row_counter + row_data["_row_id"] = row_counter row_counter += 1 # distribute data to actors in a round-robin manner @@ -306,18 +310,18 @@ def data_distributor(): return self def _process_actor_streaming( - self, - op_idx, - op, - actor, - actor_id, - actor_queues, - operators, - final_results, - result_lock, - batch_size, - termination_counters, - ): + self, + op_idx, + op, + actor, + actor_id, + actor_queues, + operators, + final_results, + result_lock, + batch_size, + termination_counters, + ): """Process data for a single operator actor in a streaming manner.""" op_name = op._name input_queue = actor_queues[op_name][actor_id] @@ -339,8 +343,10 @@ def log_data_flow(row_id, action, start_time=None): if action == "start": logger.info(f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | START | {timestamp}") elif action == "end": - duration = (time.time() - start_time) - logger.info(f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | END | {timestamp} | Duration: {duration:.3f} s") + duration = time.time() - start_time + logger.info( + f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | END | {timestamp} | Duration: {duration:.3f} s" + ) + + def flush_buffer(current_next_actor_index): """ A helper that processes and then clears the buffer. It updates the enclosing processed_count and returns the number of results successfully forwarded. """ if not batch_buffer: return 0 + nonlocal processed_count items_in_buffer = len(batch_buffer) results_count = self._process_and_forward_batch( - op, actor, batch_buffer, next_op_queues, final_results, - result_lock, current_next_actor_index, log_data_flow + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + current_next_actor_index, + log_data_flow, ) # key fix: the processed count should increase regardless of the processing result processed_count += items_in_buffer - batch_buffer.clear() # clear the buffer + batch_buffer.clear()  # clear the buffer return results_count @@ -372,13 +384,13 @@ def flush_buffer(current_next_actor_index): if data_item is None: if batch_buffer: results_count = self._process_and_forward_batch( - op, - actor, - batch_buffer, - next_op_queues, - final_results, - result_lock, - next_actor_index, + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + next_actor_index, log_data_flow, ) next_actor_index += results_count @@ -400,12 +412,18 @@ def flush_buffer(current_next_actor_index): start_time = time.time() log_data_flow(row_id, "start", start_time) - batch_buffer.append((data_item, start_time, row_id)) + batch_buffer.append((data_item, start_time, row_id)) if len(batch_buffer) >= batch_size or not op.is_batched_op(): results_count = self._process_and_forward_batch( - op, actor, batch_buffer, next_op_queues, final_results, - result_lock, next_actor_index, log_data_flow + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + next_actor_index, + log_data_flow, ) next_actor_index += results_count processed_count += len(batch_buffer) @@ -414,13 +432,13 @@ def flush_buffer(current_next_actor_index): except queue.Empty: if batch_buffer:
results_count = self._process_and_forward_batch( - op, - actor, - batch_buffer, - next_op_queues, - final_results, - result_lock, - next_actor_index, + op, + actor, + batch_buffer, + next_op_queues, + final_results, + result_lock, + next_actor_index, log_data_flow, ) next_actor_index += results_count @@ -466,8 +484,11 @@ def _process_batch(self, op, actor, batch_data, final_results, result_lock): def _submit_to_actor(self, op, actor, data_item): if isinstance(op, Mapper): if op.use_cuda(): - return actor.mapper_cuda_batched.remote(self.transform_to_2d_format(data_item)) if op.is_batched_op() else actor.mapper_cuda.remote(data_item) - # return actor.mapper_cuda_batched.remote(data_item) if op.is_batched_op() else actor.mapper_cuda.remote(data_item) + return ( + actor.mapper_cuda_batched.remote(self.transform_to_2d_format(data_item)) + if op.is_batched_op() + else actor.mapper_cuda.remote(data_item) + ) else: return actor.mapper_cpu.remote(data_item) @@ -486,14 +507,14 @@ def _submit_to_actor(self, op, actor, data_item): ) def _process_and_forward_batch( - self, - op, - actor, - batch_data_with_metadata, - next_op_queues, - final_results, - result_lock, - next_actor_index, + self, + op, + actor, + batch_data_with_metadata, + next_op_queues, + final_results, + result_lock, + next_actor_index, log_data_flow, ): """Process batch data and forward to downstream with data flow tracking""" @@ -528,7 +549,7 @@ def _process_and_forward_batch( elif results is not None: valid_results = [results] elif isinstance(op, Filter): - if results: + if results: if isinstance(results, list): valid_results = results else: @@ -543,8 +564,8 @@ def _process_and_forward_batch( try: # add row_id to result if needed if isinstance(result, dict): - result['_row_id'] = row_ids[i % len(row_ids)] - + result["_row_id"] = row_ids[i % len(row_ids)] + target_queue_idx = (next_actor_index + i) % len(next_op_queues) next_op_queues[target_queue_idx].put(result) except Exception as e: @@ -575,7 +596,7 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): thread = threading.Thread( target=self._process_single_actor, args=(op, actor, actor_queues[i], final_results, result_lock, batch_size), - daemon=True + daemon=True, ) thread.start() threads.append(thread) @@ -596,7 +617,7 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): thread.join() if final_results: - self.data = from_items(final_results) + self.data = ray.data.from_items(final_results) return self @@ -643,7 +664,9 @@ def transform_to_2d_format(self, data): """ if "__dj__source_file__" not in data: if "videos" not in data: - raise ValueError("The '__dj__source_file__' field is missing in the data and cannot be inferred from the 'videos' field.") + raise ValueError( + "The '__dj__source_file__' field is missing in the data and cannot be inferred from the 'videos' field." 
+ ) data["__dj__source_file__"] = data["videos"] source_files = data["__dj__source_file__"] diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index c2a0b9dea2..9e3ecd411a 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -9,8 +9,7 @@ from data_juicer import is_cuda_available from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import SpecialTokens, size_to_bytes -from data_juicer.utils.model_utils import free_models -from data_juicer.utils.model_utils import get_model +from data_juicer.utils.model_utils import free_models, get_model from data_juicer.utils.process_utils import calculate_np from data_juicer.utils.registry import Registry @@ -23,7 +22,7 @@ import pytz from datetime import datetime -beijing_tz = pytz.timezone('Asia/Singapore') +beijing_tz = pytz.timezone("Asia/Singapore") def convert_list_dict_to_dict_list(samples): # reconstruct samples from "list of dicts" to "dict of lists" @@ -300,10 +299,10 @@ def load_model(self, rank=None): end = time.time() end_time = datetime.fromtimestamp(end, pytz.utc).astimezone(beijing_tz) print( - f"[Actor] {self._name} Model loaded in {end - start:.3f} seconds " - f"from {start_time.strftime('%Y-%m-%d %H:%M:%S')} " - f"to {end_time.strftime('%Y-%m-%d %H:%M:%S')}" - ) + f"[Actor] {self._name} Model loaded in {end - start:.3f} seconds " + f"from {start_time.strftime('%Y-%m-%d %H:%M:%S')} " + f"to {end_time.strftime('%Y-%m-%d %H:%M:%S')}" + ) return model, processor class Mapper(OP): diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index e11a4db5ab..a510b2a72f 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -1,7 +1,6 @@ import math import re from itertools import chain -import time # 导入time模块 from pydantic import NonNegativeFloat, NonNegativeInt @@ -50,6 +49,7 @@ def __init__( threshold: NonNegativeFloat = 27.0, min_scene_len: NonNegativeInt = 15, show_progress: bool = False, + save_dir: str = None, *args, **kwargs, ): @@ -61,22 +61,27 @@ def __init__( :param threshold: Threshold passed to the detector. :param min_scene_len: Minimum length of any scene. :param show_progress: Whether to show progress from scenedetect. + :param save_dir: The directory where generated video files will be stored. + If not specified, outputs will be saved in the same directory as their corresponding input files. + This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable. :param args: extra args :param kwargs: extra args """ super().__init__(*args, **kwargs) self._init_parameters = self.remove_extra_parameters(locals()) + self._init_parameters.pop("save_dir", None) if detector not in self.avaliable_detectors: raise ValueError( f"Scene detector {detector} is not supported. " - f"Can only be one of {list(self.avaliable_detectors.keys())}." 
+ f"Can only be one of {list(self.avaliable_detectors.keys())}" ) self.detector = detector self.threshold = threshold self.min_scene_len = min_scene_len self.show_progress = show_progress + self.save_dir = save_dir # prepare detector args avaliable_kwargs = self.avaliable_detectors[self.detector] @@ -139,4 +144,4 @@ def process_single(self, sample, context=False): sample[Fields.source_file].extend([value] * len(output_video_keys[value])) sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys])) - return sample + return sample \ No newline at end of file diff --git a/data_juicer/utils/file_utils.py b/data_juicer/utils/file_utils.py index b1a6478d21..edb430189d 100644 --- a/data_juicer/utils/file_utils.py +++ b/data_juicer/utils/file_utils.py @@ -162,12 +162,19 @@ def transfer_data_dir(original_dir, op_name): return new_dir -def transfer_filename(original_filepath: Union[str, Path], op_name, **op_kwargs): +def transfer_filename(original_filepath: Union[str, Path], op_name, save_dir: str = None, **op_kwargs): """ According to the op and hashing its parameters 'op_kwargs' addition to the process id and current time as the 'hash_val', map the original_filepath to another unique file path. E.g. + When `save_dir` is provided: '/save_dir/path/to/data/' + /path/to/abc.jpg --> + /save_dir/path/to/data/abc__dj_hash_#{hash_val}#.jpg + When environment variable `DJ_PRODUCED_DATA_DIR` is provided: '/environment/path/to/data/' + /path/to/abc.jpg --> + /environment/path/to/data/{op_name}/abc__dj_hash_#{hash_val}#.jpg + When neither `save_dir` nor `DJ_PRODUCED_DATA_DIR` is provided: 1. abc.jpg --> __dj__produced_data__/{op_name}/ abc__dj_hash_#{hash_val}#.jpg @@ -182,16 +189,25 @@ def transfer_filename(original_filepath: Union[str, Path], op_name, **op_kwargs) /path/to/__dj__produced_data__/{op_name}/ abc__dj_hash_#{hash_val2}#.jpg + Priority: `save_dir` > `DJ_PRODUCED_DATA_DIR` > original data directory (default) """ # check if it's valid local path, if it's not, regard it as a remote path/url and return None if not os.path.exists(original_filepath): return original_filepath - # produce the directory - original_dir = os.path.dirname(original_filepath) - dir_token = f"/{Fields.multimodal_data_output_dir}/" - if dir_token in original_dir: - original_dir = original_dir.split(dir_token)[0] - new_dir = transfer_data_dir(original_dir, op_name) + + if save_dir: + new_dir = os.path.abspath(save_dir) + elif produced_data_dir := os.environ.get("DJ_PRODUCED_DATA_DIR", None): + new_dir = os.path.join(os.path.abspath(produced_data_dir), op_name) + else: + # produce the directory + original_dir = os.path.dirname(original_filepath) + dir_token = f"/{Fields.multimodal_data_output_dir}/" + if dir_token in original_dir: + original_dir = original_dir.split(dir_token)[0] + new_dir = transfer_data_dir(original_dir, op_name) + + create_directory_if_not_exists(new_dir) # produce the unique hash code unique_parameters = copy.deepcopy(op_kwargs) @@ -421,4 +437,4 @@ async def download_file( if return_content: return response, content - return response + return response \ No newline at end of file From 7aa2f79125bd6ab7b6dd0f394624294cc05bdadc Mon Sep 17 00:00:00 2001 From: xcy Date: Thu, 21 Aug 2025 14:17:50 +0800 Subject: [PATCH 12/16] update for pre-commit --- data_juicer/core/RayOperatorWrapper.py | 11 ++- data_juicer/core/data/dj_dataset.py | 2 +- data_juicer/core/data/ray_dataset.py | 99 +++++++++----------------- data_juicer/ops/base_op.py | 5 +- 4 files changed, 41 
insertions(+), 76 deletions(-) diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py index c0321c9983..eb34c06dcf 100644 --- a/data_juicer/core/RayOperatorWrapper.py +++ b/data_juicer/core/RayOperatorWrapper.py @@ -1,6 +1,6 @@ import ray -@ray.remote(num_gpus=0.0) +@ray.remote(num_gpus=0.0) class Actor: def __init__(self, op, rank=None): @@ -53,7 +53,7 @@ def filter_cuda_batched(self, data): self.load_model() data = self.op.compute_stats_batched(data, self.model, self.processor) # transform the map object to a list - keep_mask = list(self.op.process_batched(data)) + keep_mask = list(self.op.process_batched(data)) if not any(keep_mask): return None @@ -70,7 +70,6 @@ def filter_cuda_batched(self, data): return filtered_data - def filter_cpu_single(self, data): data = self.op.compute_stats_single(data) keep = self.op.process_single(data) @@ -82,7 +81,7 @@ def filter_cpu_single(self, data): def filter_cpu_batched(self, data): data = self.op.compute_stats_batched(data) # transform the map object to a list - keep_mask = list(self.op.process_batched(data)) + keep_mask = list(self.op.process_batched(data)) if not any(keep_mask): return None @@ -98,6 +97,4 @@ def filter_cpu_batched(self, data): else: raise ValueError("Unsupported data type for batch filtering") - return filtered_data - - + return filtered_data \ No newline at end of file diff --git a/data_juicer/core/data/dj_dataset.py b/data_juicer/core/data/dj_dataset.py index 9d5a6eda82..e182f247fe 100644 --- a/data_juicer/core/data/dj_dataset.py +++ b/data_juicer/core/data/dj_dataset.py @@ -36,7 +36,7 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) - """process a list of operators on the dataset.""" @abstractmethod - def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: + def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: """Implementing op parallel data processing based on Ray Actor""" @abstractmethod diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index feefa01800..51e905faf9 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -1,14 +1,13 @@ from __future__ import annotations -from datetime import datetime import os -from functools import partial import queue import threading import time -from typing import Any, Dict, List, Literal, Optional, Union import uuid -from data_juicer.core.RayOperatorWrapper import Actor +from datetime import datetime +from functools import partial +from typing import Any, Dict, List, Literal, Optional, Union import pyarrow from jsonargparse import Namespace from loguru import logger @@ -16,6 +15,7 @@ from data_juicer import cuda_device_count from data_juicer.core.data import DJDataset from data_juicer.core.data.schema import Schema +from data_juicer.core.RayOperatorWrapper import Actor from data_juicer.ops import Deduplicator, Filter, Mapper from data_juicer.ops.base_op import TAGGING_OPS from data_juicer.utils.constant import Fields @@ -96,8 +96,6 @@ class RayDataset(DJDataset): def __init__(self, dataset: ray.data.Dataset, dataset_path: str = None, cfg: Optional[Namespace] = None) -> None: self.data = preprocess_dataset(dataset, dataset_path, cfg) self.num_proc = getattr(cfg, "np", getattr(cfg, "num_proc", None)) if cfg else None - # self.gpu_pg = placement_group([{"CPU": 16, "GPU": 2}], strategy="STRICT_SPREAD") - # ray.get(self.gpu_pg.ready()) def 
schema(self) -> Schema: """Get dataset schema. @@ -252,15 +250,15 @@ def process_batch_arrow(table: pyarrow.Table): thread = threading.Thread( target=self._process_actor_streaming, args=( - idx, - op, - actor, - i, - actor_queues, - operators, - final_results, - result_lock, - batch_sizes[op._name], + idx, + op, + actor, + i, + actor_queues, + operators, + final_results, + result_lock, + batch_sizes[op._name], termination_counters, ), name=f"actor_{op._name}_{i}", @@ -310,23 +308,22 @@ def data_distributor(): return self def _process_actor_streaming( - self, + self, op_idx, - op, - actor, - actor_id, - actor_queues, - operators, - final_results, - result_lock, - batch_size, + op, + actor, + actor_id, + actor_queues, + operators, + final_results, + result_lock, + batch_size, termination_counters, ): """Process data for a single operator actor in a streaming manner.""" op_name = op._name input_queue = actor_queues[op_name][actor_id] - # 确定输出目标 next_op_queues = None if op_idx + 1 < len(operators): next_op_name = operators[op_idx + 1]._name @@ -348,38 +345,9 @@ def log_data_flow(row_id, action, start_time=None): f"[DataFlow] Row {row_id} | {op_name}_actor_{actor_id} | END | {timestamp} | Duration: {duration:.3f} s" ) - def flush_buffer(current_next_actor_index): - """ - A helper that processes and then clears the buffer. - It updates the enclosing processed_count and returns the number of results successfully forwarded. - """ - if not batch_buffer: - return 0 - - nonlocal processed_count - items_in_buffer = len(batch_buffer) - - results_count = self._process_and_forward_batch( - op, - actor, - batch_buffer, - next_op_queues, - final_results, - result_lock, - current_next_actor_index, - log_data_flow, - ) - - # key fix: the processed count should increase regardless of the processing result - processed_count += items_in_buffer - batch_buffer.clear()  # clear the buffer - - return results_count - while True: try: - # use a shorter timeout so that timeout-triggered batches are handled more frequently - data_item = input_queue.get(timeout=5.0) + data_item = input_queue.get(timeout=5.0) if data_item is None: if batch_buffer: @@ -507,16 +475,16 @@ def _submit_to_actor(self, op, actor, data_item): ) def _process_and_forward_batch( - self, - op, - actor, - batch_data_with_metadata, - next_op_queues, - final_results, - result_lock, - next_actor_index, - log_data_flow, - ): + self, + op, + actor, + batch_data_with_metadata, + next_op_queues, + final_results, + result_lock, + next_actor_index, + log_data_flow, + ): """Process batch data and forward to downstream with data flow tracking""" if not batch_data_with_metadata: return 0 @@ -654,7 +622,6 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo except Exception as e: logger.error(f"Error in single actor processing: {e}") break - logger.info(f"Single actor completed, processed {processed_count} items") diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 187b2e1c76..782cff1449 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,6 +1,6 @@ import copy -from functools import wraps import time +from functools import wraps import numpy as np import pyarrow as pa @@ -20,9 +20,10 @@ TAGGING_OPS = Registry("Tagging Operators") ATTRIBUTION_FILTERS = Registry("Attribution Filters") +from datetime import datetime import pytz -from datetime import datetime + beijing_tz = pytz.timezone("Asia/Singapore") def convert_list_dict_to_dict_list(samples): From bb62e90979584bdfc295784fbcefc1067b272da6 Mon Sep 17 00:00:00 2001 From: xcy Date: Thu, 21 Aug 2025 14:22:44 +0800 Subject: [PATCH 13/16] rename ray_actor.py --- .gitignore | 5 ++ data_juicer/core/ray_actor.py | 120 ----------------------------------
2 files changed, 5 insertions(+), 120 deletions(-) delete mode 100644 data_juicer/core/ray_actor.py diff --git a/.gitignore b/.gitignore index 10b9e7ecde..a0038b36c7 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,8 @@ tests/ops/data/*dup* tests/tools/tmp_*/ tests/ops/deduplicator/chinese_dedup/ tests/ops/deduplicator/english_dedup/ +demos/process_video_on_ray/data/videos/ +log.txt +mps_test_results/ +spatial-sharing-sys/ +test.py diff --git a/data_juicer/core/ray_actor.py b/data_juicer/core/ray_actor.py deleted file mode 100644 index 9eec2dd797..0000000000 --- a/data_juicer/core/ray_actor.py +++ /dev/null @@ -1,120 +0,0 @@ -from functools import partial -import ray -import pyarrow - -from data_juicer.ops.base_op import Filter, Mapper -from loguru import logger - - - -def filter_batch(batch, filter_func): - mask = pyarrow.array(filter_func(batch.to_pydict())) - return batch.filter(mask) - -@ray.remote(num_gpus=0.0) -class Actor: - def __init__(self, op, rank=None): - - self.op = op - self._model_loaded = False # 标记模型是否已加载 - self.rank = rank - self.model = None - self.processor = None - - def load_model(self): - - if self.op.use_cuda() and not self._model_loaded: - - self.model, self.processor = self.op.load_model(rank=self.rank) - self._model_loaded = True - - def mapper_cuda(self, data): - if not self._model_loaded: - self.load_model() # 确保调用前模型已加载 - data = self.op.process_single(data, self.model, self.processor) - return data - - def mapper_cuda_batched(self, data): - if not self._model_loaded: - self.load_model() # 确保调用前模型已加载 - data = self.op.process_batched_actor(data, self.model, self.processor) - return data - - def mapper_cpu(self, data): - # 处理数据 - processed_data = self.op.process_single(data) - return processed_data - - def filter_cuda_single(self, data): - if not self._model_loaded: - self.load_model() - data = self.op.compute_stats_single_actor(data, self.model, self.processor) - keep = self.op.process_single(data) - - if keep: - return data - else: - return None - - def filter_cuda_batched(self, data): - if not self._model_loaded: - self.load_model() - data = self.op.compute_stats_batched(data, self.model, self.processor) - - keep_mask = list(self.op.process_batched(data)) # 将map对象转换为列表 - - # 如果没有数据需要保留,返回None - if not any(keep_mask): - return None - - # 根据掩码过滤数据 - if isinstance(data, dict): - # 如果data是字典(假设每个key对应一个列表) - filtered_data = { - key: [value for value, keep in zip(values, keep_mask) if keep] - for key, values in data.items() - } - elif isinstance(data, list): - # 如果data是列表 - filtered_data = [item for item, keep in zip(data, keep_mask) if keep] - else: - # 其他情况(如Ray Dataset的批处理) - raise ValueError("Unsupported data type for batch filtering") - - return filtered_data - - - def filter_cpu_single(self, data): - data = self.op.compute_stats_single(data) - keep = self.op.process_single(data) - if keep: - return data - else: - return None - - def filter_cpu_batched(self, data): - # data = self.op.compute_stats_batched(data, self.model, self.processor) - data = self.op.compute_stats_batched(data) - keep_mask = list(self.op.process_batched(data)) # 将map对象转换为列表 - - # 如果没有数据需要保留,返回None - if not any(keep_mask): - return None - - # 根据掩码过滤数据 - if isinstance(data, dict): - # 如果data是字典(假设每个key对应一个列表) - filtered_data = { - key: [value for value, keep in zip(values, keep_mask) if keep] - for key, values in data.items() - } - elif isinstance(data, list): - # 如果data是列表 - filtered_data = [item for item, keep in zip(data, keep_mask) if keep] - else: - # 其他情况(如Ray Dataset的批处理) - 
raise ValueError("Unsupported data type for batch filtering") - - return filtered_data - - From a978375a602e1338cf2ff3355b2c5be617707aca Mon Sep 17 00:00:00 2001 From: xcy Date: Thu, 21 Aug 2025 16:35:44 +0800 Subject: [PATCH 14/16] update for pre-commit --- data_juicer/core/RayOperatorWrapper.py | 2 -- data_juicer/ops/base_op.py | 8 ++------ .../ops/mapper/video_captioning_from_frames_mapper.py | 11 ++++------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py index eb34c06dcf..8b7d21f249 100644 --- a/data_juicer/core/RayOperatorWrapper.py +++ b/data_juicer/core/RayOperatorWrapper.py @@ -92,9 +92,7 @@ def filter_cpu_batched(self, data): key: [value for value, keep in zip(values, keep_mask) if keep] for key, values in data.items() } elif isinstance(data, list): - filtered_data = [item for item, keep in zip(data, keep_mask) if keep] else: raise ValueError("Unsupported data type for batch filtering") - return filtered_data \ No newline at end of file diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 782cff1449..4d33c28d83 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,16 +1,16 @@ import copy import time from functools import wraps +from datetime import datetime import numpy as np import pyarrow as pa +import pytz from data_juicer import is_cuda_available from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import SpecialTokens, size_to_bytes - from data_juicer.utils.model_utils import free_models, get_model - from data_juicer.utils.process_utils import calculate_np from data_juicer.utils.registry import Registry @@ -20,10 +20,6 @@ TAGGING_OPS = Registry("Tagging Operators") ATTRIBUTION_FILTERS = Registry("Attribution Filters") -from datetime import datetime - -import pytz - beijing_tz = pytz.timezone("Asia/Singapore") def convert_list_dict_to_dict_list(samples): diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index 9c6c05d3d1..b6e9f51c38 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -331,7 +331,6 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk - def process_batched_actor(self, samples, model, processor, rank=None, context=False): """ Process a batch of samples in the actor. 
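The hunks above keep leaning on one conversion: Ray hands a batched operator its samples as a "dict of lists", while the per-sample logic wants a "list of dicts", so process_batched_actor reconstructs one layout from the other on the way in and out. A minimal, self-contained sketch of that round trip (the field names are illustrative, not data-juicer's real schema):

def dict_of_lists_to_list_of_dicts(batch):
    # one dict per row, built by zipping the column lists together
    keys = list(batch.keys())
    return [dict(zip(keys, row)) for row in zip(*(batch[k] for k in keys))]

def list_of_dicts_to_dict_of_lists(samples):
    # the inverse: one list per column
    keys = samples[0].keys()
    return {key: [s[key] for s in samples] for key in keys}

batch = {"videos": ["a.mp4", "b.mp4"], "text": ["caption a", "caption b"]}
rows = dict_of_lists_to_list_of_dicts(batch)
# rows == [{'videos': 'a.mp4', 'text': 'caption a'}, {'videos': 'b.mp4', 'text': 'caption b'}]
assert list_of_dicts_to_dict_of_lists(rows) == batch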
@@ -349,10 +348,10 @@ def process_batched_actor(self, samples, model, processor, rank=None, context=Fa if self.keep_original_sample: samples_after_generation.append(ori_sample) generated_samples = self._process_single_sample_actor(ori_sample, - model, - processor, - rank=rank, - context=context) + model, + processor, + rank=rank, + context=context) if len(generated_samples) != 0: samples_after_generation.extend(generated_samples) # reconstruct samples from "list of dicts" to "dict of lists" @@ -363,9 +362,7 @@ def process_batched_actor(self, samples, model, processor, rank=None, context=Fa return res_samples - def _process_single_sample(self, ori_sample, rank=None, context=False): - # there is no videos in this sample if self.video_key not in ori_sample or not ori_sample[self.video_key]: return [] From 042a0b472f8859d68c7c598da776f457ac837a6e Mon Sep 17 00:00:00 2001 From: xcy Date: Thu, 21 Aug 2025 19:25:03 +0800 Subject: [PATCH 15/16] Fix trailing whitespace and pre-commit issues --- data_juicer/core/RayOperatorWrapper.py | 21 ++--- data_juicer/core/data/ray_dataset.py | 83 ++++++++++--------- data_juicer/core/executor/ray_executor.py | 4 +- data_juicer/ops/base_op.py | 8 +- .../ops/filter/video_aesthetics_filter.py | 2 + .../ops/filter/video_watermark_filter.py | 3 +- .../video_captioning_from_frames_mapper.py | 2 +- .../ops/mapper/video_split_by_scene_mapper.py | 2 +- data_juicer/utils/file_utils.py | 2 +- 9 files changed, 67 insertions(+), 60 deletions(-) diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py index 8b7d21f249..09707ef10f 100644 --- a/data_juicer/core/RayOperatorWrapper.py +++ b/data_juicer/core/RayOperatorWrapper.py @@ -1,5 +1,6 @@ import ray + @ray.remote(num_gpus=0.0) class Actor: def __init__(self, op, rank=None): @@ -9,11 +10,11 @@ def __init__(self, op, rank=None): self.rank = rank self.model = None self.processor = None - + def load_model(self): if self.op.use_cuda() and not self._model_loaded: - + self.model, self.processor = self.op.load_model(rank=self.rank) self._model_loaded = True @@ -23,7 +24,7 @@ def mapper_cuda(self, data): # process data data = self.op.process_single_actor(data, self.model, self.processor) return data - + def mapper_cuda_batched(self, data): if not self._model_loaded: self.load_model() # ensure model is loaded before processing @@ -35,7 +36,7 @@ def mapper_cpu(self, data): # process data processed_data = self.op.process_single(data) return processed_data - + def filter_cuda_single(self, data): if not self._model_loaded: self.load_model() @@ -47,17 +48,17 @@ def filter_cuda_single(self, data): return data else: return None - + def filter_cuda_batched(self, data): if not self._model_loaded: self.load_model() data = self.op.compute_stats_batched(data, self.model, self.processor) # transform the map object to a list keep_mask = list(self.op.process_batched(data)) - + if not any(keep_mask): return None - + # filter data based on the keep_mask if isinstance(data, dict): filtered_data = { @@ -67,7 +68,7 @@ def filter_cuda_batched(self, data): filtered_data = [item for item, keep in zip(data, keep_mask) if keep] else: raise ValueError("Unsupported data type for batch filtering") - + return filtered_data def filter_cpu_single(self, data): @@ -77,7 +78,7 @@ def filter_cpu_single(self, data): return data else: return None - + def filter_cpu_batched(self, data): data = self.op.compute_stats_batched(data) # transform the map object to a list @@ -95,4 +96,4 @@ def filter_cpu_batched(self, data): filtered_data = 
[item for item, keep in zip(data, keep_mask) if keep] else: raise ValueError("Unsupported data type for batch filtering") - return filtered_data \ No newline at end of file + return filtered_data diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py index 51e905faf9..4d99028f35 100644 --- a/data_juicer/core/data/ray_dataset.py +++ b/data_juicer/core/data/ray_dataset.py @@ -8,6 +8,7 @@ from datetime import datetime from functools import partial from typing import Any, Dict, List, Literal, Optional, Union + import pyarrow from jsonargparse import Namespace from loguru import logger @@ -26,6 +27,7 @@ ray = LazyLoader("ray") + def get_abs_path(path, dataset_dir): if is_remote_path(path): return path @@ -155,8 +157,7 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) - self._run_single_op(op) self.data = self.data.materialize() return self - - + def process_parallel(self, operators, *, exporter=None, checkpointer=None, tracer=None) -> DJDataset: """ Process the dataset in parallel using multiple operators. @@ -179,6 +180,7 @@ def process_parallel(self, operators, *, exporter=None, checkpointer=None, trace add_stats = True if add_meta: + def process_batch_arrow(table: pyarrow.Table): new_column_data = [{} for _ in range(len(table))] new_table = table.append_column(Fields.meta, [new_column_data]) @@ -187,6 +189,7 @@ def process_batch_arrow(table: pyarrow.Table): self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow") if add_stats: + def process_batch_arrow(table: pyarrow.Table): new_column_data = [{} for _ in range(len(table))] new_table = table.append_column(Fields.stats, [new_column_data]) @@ -273,19 +276,19 @@ def data_distributor(): first_op_queues = actor_queues[first_op._name] actor_index = 0 row_counter = 0 # Initialize row counter - + try: for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): for row_idx in range(len(batch)): row_data = {col: batch[col][row_idx].as_py() for col in batch.column_names} row_data["_row_id"] = row_counter row_counter += 1 - + # distribute data to actors in a round-robin manner target_queue = first_op_queues[actor_index % len(first_op_queues)] target_queue.put(row_data) actor_index += 1 - + except Exception as e: logger.error(f"Error in data distributor: {e}") finally: @@ -306,7 +309,7 @@ def data_distributor(): self.data = ray.data.from_items(final_results) return self - + def _process_actor_streaming( self, op_idx, @@ -323,17 +326,17 @@ def _process_actor_streaming( """Process data for a single operator actor in a streaming manner.""" op_name = op._name input_queue = actor_queues[op_name][actor_id] - + next_op_queues = None if op_idx + 1 < len(operators): next_op_name = operators[op_idx + 1]._name next_op_queues = actor_queues[next_op_name] - + logger.info(f"Starting streaming processor for {op_name} actor {actor_id}") processed_count = 0 batch_buffer = [] next_actor_index = 0 - + # data flow logging function def log_data_flow(row_id, action, start_time=None): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] @@ -348,7 +351,7 @@ def log_data_flow(row_id, action, start_time=None): while True: try: data_item = input_queue.get(timeout=5.0) - + if data_item is None: if batch_buffer: results_count = self._process_and_forward_batch( @@ -362,7 +365,7 @@ def log_data_flow(row_id, action, start_time=None): log_data_flow, ) next_actor_index += results_count - + # update termination counter with termination_counters[op_name]["lock"]: 
termination_counters[op_name]["count"] += 1 @@ -373,13 +376,13 @@ def log_data_flow(row_id, action, start_time=None): if current_count >= total_actors and next_op_queues: for q in next_op_queues: q.put(None) - + break - + row_id = data_item.get("_row_id", "unknown") start_time = time.time() log_data_flow(row_id, "start", start_time) - + batch_buffer.append((data_item, start_time, row_id)) if len(batch_buffer) >= batch_size or not op.is_batched_op(): @@ -396,7 +399,7 @@ def log_data_flow(row_id, action, start_time=None): next_actor_index += results_count processed_count += len(batch_buffer) batch_buffer = [] - + except queue.Empty: if batch_buffer: results_count = self._process_and_forward_batch( @@ -416,14 +419,14 @@ def log_data_flow(row_id, action, start_time=None): except Exception as e: logger.error(f"Error in {op_name} actor {actor_id}: {e}") break - + logger.info(f"Streaming processor for {op_name} actor {actor_id} completed, processed {processed_count} items") def _process_batch(self, op, actor, batch_data, final_results, result_lock): """Process a batch of data with the given operator and actor.""" if not batch_data: return - + try: if len(batch_data) == 1: future = self._submit_to_actor(op, actor, batch_data[0]) @@ -445,7 +448,7 @@ def _process_batch(self, op, actor, batch_data, final_results, result_lock): final_results.extend(results) elif results is not None: final_results.append(results) - + except Exception as e: logger.error(f"Error processing batch: {e}") @@ -488,12 +491,12 @@ def _process_and_forward_batch( """Process batch data and forward to downstream with data flow tracking""" if not batch_data_with_metadata: return 0 - + # separate the data, start time, and line number batch_data = [item[0] for item in batch_data_with_metadata] start_times = [item[1] for item in batch_data_with_metadata] row_ids = [item[2] for item in batch_data_with_metadata] - + try: if len(batch_data) == 1: future = self._submit_to_actor(op, actor, batch_data[0]) @@ -509,7 +512,7 @@ def _process_and_forward_batch( elif result is not None: flattened_results.append(result) results = flattened_results - + valid_results = [] if isinstance(op, Mapper): if isinstance(results, list): @@ -522,7 +525,7 @@ def _process_and_forward_batch( valid_results = results else: valid_results = [results] - + for row_id, start_time in zip(row_ids, start_times): log_data_flow(row_id, "end", start_time) @@ -541,20 +544,20 @@ def _process_and_forward_batch( elif not next_op_queues and valid_results: with result_lock: final_results.extend(valid_results) - + return len(valid_results) - + except Exception as e: for row_id, start_time in zip(row_ids, start_times): log_data_flow(row_id, "end", start_time) logger.error(f"Error processing and forwarding batch: {e}") return 0 - + def _process_single_operator_streaming(self, op, op_actors, batch_size): """Stream processing for a single operator.""" final_results = [] result_lock = threading.Lock() - + # create an independent queue for each actor. 
actor_queues = [queue.Queue(maxsize=50) for _ in op_actors] @@ -568,7 +571,7 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): ) thread.start() threads.append(thread) - + actor_index = 0 for batch in self.data.iter_batches(batch_size=1, batch_format="pyarrow"): for row_idx in range(len(batch)): @@ -576,19 +579,19 @@ def _process_single_operator_streaming(self, op, op_actors, batch_size): target_queue = actor_queues[actor_index % len(actor_queues)] target_queue.put(row_data) actor_index += 1 - + # notify all Actors to terminate. for actor_queue in actor_queues: actor_queue.put(None) - + for thread in threads: thread.join() - + if final_results: self.data = ray.data.from_items(final_results) - + return self - + def _process_single_actor(self, op, actor, input_queue, final_results, result_lock, batch_size): """ Process data for a single actor in a streaming manner. @@ -596,7 +599,7 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo """ batch_buffer = [] processed_count = 0 - + while True: try: data_item = input_queue.get(timeout=30.0) @@ -604,16 +607,16 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo if batch_buffer: self._process_batch(op, actor, batch_buffer, final_results, result_lock) break - + batch_buffer.append(data_item) - + # process when the batch is full or during batch processing operations. if len(batch_buffer) >= batch_size or not op.is_batched_op(): processed_batch_len = len(batch_buffer) self._process_batch(op, actor, batch_buffer, final_results, result_lock) batch_buffer = [] processed_count += processed_batch_len - + except queue.Empty: if batch_buffer: self._process_batch(op, actor, batch_buffer, final_results, result_lock) @@ -624,7 +627,6 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo break logger.info(f"Single actor completed, processed {processed_count} items") - def transform_to_2d_format(self, data): """ Transform data to 2D format for processing. 
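The streaming path above reduces to a feeder/consumer pattern: one bounded queue per actor, round-robin distribution, batch buffering, and one None sentinel per queue. A minimal thread-based sketch of that control flow follows; `square_op` is a hypothetical operator and plain threads stand in for Ray actors, so this illustrates the pattern rather than the actual data-juicer API:

import queue
import threading


def square_op(item):
    # hypothetical stand-in for an operator's remote call
    return item * item


def actor_loop(in_q, results, lock, batch_size=4):
    buffer = []
    while True:
        item = in_q.get()
        if item is None:  # sentinel: flush the remainder and exit
            if buffer:
                with lock:
                    results.extend(square_op(x) for x in buffer)
            break
        buffer.append(item)
        if len(buffer) >= batch_size:  # flush a full batch
            with lock:
                results.extend(square_op(x) for x in buffer)
            buffer = []


queues = [queue.Queue(maxsize=50) for _ in range(3)]
results, lock = [], threading.Lock()
threads = [threading.Thread(target=actor_loop, args=(q, results, lock)) for q in queues]
for t in threads:
    t.start()
for i in range(20):  # feeder: round-robin distribution, like the loop above
    queues[i % len(queues)].put(i)
for q in queues:  # exactly one sentinel per consumer queue
    q.put(None)
for t in threads:
    t.join()
print(sorted(results))  # squares of 0..19

One sentinel per queue matters: each consumer owns its queue exclusively, so a single None reliably terminates exactly one actor, which is the same invariant the termination counters enforce across operators in the multi-operator path.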
@@ -639,11 +641,11 @@ def transform_to_2d_format(self, data): source_files = data["__dj__source_file__"] unique_sources = list(dict.fromkeys(source_files)) - + source_to_indices = {} for source in unique_sources: source_to_indices[source] = [i for i, s in enumerate(source_files) if s == source] - + transformed_data = {} for field_name, field_value in data.items(): if field_name == "__dj__source_file__": @@ -677,7 +679,7 @@ def transform_to_2d_format(self, data): indices = source_to_indices[source] transformed_data[field_name].append(field_value) else: - + transformed_data[field_name] = [] for source in unique_sources: indices = source_to_indices[source] @@ -685,7 +687,6 @@ def transform_to_2d_format(self, data): return transformed_data - def _run_single_op(self, op): op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda()) num_gpus = get_num_gpus(op, op_proc) diff --git a/data_juicer/core/executor/ray_executor.py b/data_juicer/core/executor/ray_executor.py index 0c6e30d028..51f26a5e1f 100644 --- a/data_juicer/core/executor/ray_executor.py +++ b/data_juicer/core/executor/ray_executor.py @@ -57,9 +57,9 @@ def __init__(self, cfg: Optional[Namespace] = None): # init ray logger.info("Initializing Ray ...") - + ray.init(self.cfg.ray_address, ignore_reinit_error=True) - + self.tmp_dir = os.path.join(self.work_dir, ".tmp", ray.get_runtime_context().get_job_id()) # absolute path resolution logic diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 4d33c28d83..e337c938fa 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,7 +1,7 @@ import copy import time -from functools import wraps from datetime import datetime +from functools import wraps import numpy as np import pyarrow as pa @@ -22,6 +22,7 @@ beijing_tz = pytz.timezone("Asia/Singapore") + def convert_list_dict_to_dict_list(samples): # reconstruct samples from "list of dicts" to "dict of lists" keys = samples[0].keys() @@ -298,7 +299,7 @@ def empty_history(self): return np.empty((0, 0), dtype=str) def load_model(self, rank=None): - + start = time.time() start_time = datetime.fromtimestamp(start, pytz.utc).astimezone(beijing_tz) model, processor = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda()) @@ -310,7 +311,8 @@ def load_model(self, rank=None): f"to {end_time.strftime('%Y-%m-%d %H:%M:%S')}" ) return model, processor - + + class Mapper(OP): def __init__(self, *args, **kwargs): """ diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 5a9827a544..89b0c4ce7e 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -20,6 +20,7 @@ OP_NAME = "video_aesthetics_filter" + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) @@ -185,6 +186,7 @@ def compute_stats_single_actor(self, sample, model, processor, rank=None, contex close_video(videos[vid_key]) return sample + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 4237757562..6f1f96849b 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -19,6 +19,7 @@ OP_NAME = "video_watermark_filter" + 
@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) @@ -165,7 +166,7 @@ def compute_stats_single_actor(self, sample, model, processor, rank=None, contex close_video(videos[vid_key]) return sample - + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_watermark_prob in sample[Fields.stats]: diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index b6e9f51c38..5b70b7f6ec 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -361,7 +361,7 @@ def process_batched_actor(self, samples, model, processor, rank=None, context=Fa res_samples[key] = [s[key] for s in samples_after_generation] return res_samples - + def _process_single_sample(self, ori_sample, rank=None, context=False): # there is no videos in this sample if self.video_key not in ori_sample or not ori_sample[self.video_key]: diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index a510b2a72f..58d78ebad1 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -144,4 +144,4 @@ def process_single(self, sample, context=False): sample[Fields.source_file].extend([value] * len(output_video_keys[value])) sample[self.video_key] = list(chain.from_iterable([output_video_keys[key] for key in loaded_video_keys])) - return sample \ No newline at end of file + return sample diff --git a/data_juicer/utils/file_utils.py b/data_juicer/utils/file_utils.py index edb430189d..385ecbf60f 100644 --- a/data_juicer/utils/file_utils.py +++ b/data_juicer/utils/file_utils.py @@ -437,4 +437,4 @@ async def download_file( if return_content: return response, content - return response \ No newline at end of file + return response From 6875b7d87f085f93716fc60b4f0460ab239ee09a Mon Sep 17 00:00:00 2001 From: xcy Date: Mon, 25 Aug 2025 15:00:39 +0800 Subject: [PATCH 16/16] Fix unittest error --- data_juicer/core/RayOperatorWrapper.py | 18 ++++++++++++++---- data_juicer/core/data/dj_dataset.py | 3 +++ data_juicer/core/data/ray_dataset.py | 19 +++++++++++++++---- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/data_juicer/core/RayOperatorWrapper.py b/data_juicer/core/RayOperatorWrapper.py index 09707ef10f..1bc3745a78 100644 --- a/data_juicer/core/RayOperatorWrapper.py +++ b/data_juicer/core/RayOperatorWrapper.py @@ -72,6 +72,10 @@ def filter_cuda_batched(self, data): return filtered_data def filter_cpu_single(self, data): + if "text" in data and isinstance(data["text"], list) and len(data["text"]) == 1: + data["text"] = data["text"][0] + if "__dj__stats__" in data and isinstance(data["__dj__stats__"], list) and len(data["__dj__stats__"]) == 1: + data["__dj__stats__"] = data["__dj__stats__"][0] data = self.op.compute_stats_single(data) keep = self.op.process_single(data) if keep: @@ -81,7 +85,7 @@ def filter_cpu_single(self, data): def filter_cpu_batched(self, data): data = self.op.compute_stats_batched(data) - # transform the map object to a list + keep_mask = list(self.op.process_batched(data)) if not any(keep_mask): @@ -89,11 +93,17 @@ def filter_cpu_batched(self, data): # filter data based on the keep_mask if isinstance(data, dict): - filtered_data = { - key: [value for value, keep in zip(values, 
keep_mask) if keep] for key, values in data.items()
-        }
+        filtered_data = {}
+        for key, values in data.items():
+            if key in ["text", "__dj__stats__"]:
+                # apply the keep mask to these per-sample fields
+                filtered_data[key] = [value for value, keep in zip(values, keep_mask) if keep]
+            else:
+                # keep the other fields unchanged
+                filtered_data[key] = values
         elif isinstance(data, list):
             filtered_data = [item for item, keep in zip(data, keep_mask) if keep]
         else:
             raise ValueError("Unsupported data type for batch filtering")
+
         return filtered_data
diff --git a/data_juicer/core/data/dj_dataset.py b/data_juicer/core/data/dj_dataset.py
index e182f247fe..92ddc78ce7 100644
--- a/data_juicer/core/data/dj_dataset.py
+++ b/data_juicer/core/data/dj_dataset.py
@@ -348,6 +348,9 @@ def process(
             logger.error("Error occurred when making log summarization")
         return dataset

+    def process_parallel(self, *args, **kwargs):
+        raise NotImplementedError("The process_parallel method needs to be implemented for the NestedDataset class.")
+
     def update_args(self, args, kargs, is_filter=False):
         if args:
             args = list(args)
diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index 4d99028f35..50c22d1480 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -184,6 +184,7 @@ def process_parallel(self, operators, *, exporter=None, checkpointer=None, trace
             def process_batch_arrow(table: pyarrow.Table):
                 new_column_data = [{} for _ in range(len(table))]
                 new_table = table.append_column(Fields.meta, [new_column_data])
+                print("new_table:", new_table)
                 return new_table

             self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
@@ -193,6 +194,7 @@ def process_batch_arrow(table: pyarrow.Table):
             def process_batch_arrow(table: pyarrow.Table):
                 new_column_data = [{} for _ in range(len(table))]
                 new_table = table.append_column(Fields.stats, [new_column_data])
+                print("new_table:", new_table)
                 return new_table

             self.data = self.data.map_batches(process_batch_arrow, batch_format="pyarrow")
@@ -205,7 +207,6 @@ def process_batch_arrow(table: pyarrow.Table):
                 if op.use_cuda()
                 else calculate_np(op._name, op.mem_required, op.cpu_required, self.num_proc, op.use_cuda())
             )
-            # actor_num = min(op_proc, self.data.count())
             actor_num = op_proc
             actors[op._name] = []

@@ -304,8 +305,8 @@ def data_distributor():
         # wait for all processing threads to finish
         for thread in threads:
             thread.join()
-
         if final_results:
+
             self.data = ray.data.from_items(final_results)

         return self
@@ -453,6 +454,7 @@ def _process_batch(self, op, actor, batch_data, final_results, result_lock):
             logger.error(f"Error processing batch: {e}")

     def _submit_to_actor(self, op, actor, data_item):
+        """Submit a single data item to the actor for processing."""
         if isinstance(op, Mapper):
             if op.use_cuda():
                 return (
@@ -472,7 +474,7 @@ def _submit_to_actor(self, op, actor, data_item):
                 )
             else:
                 return (
-                    actor.filter_cpu_batched.remote(data_item)
+                    actor.filter_cpu_batched.remote(self.transform(data_item))
                     if op.is_batched_op()
                     else actor.filter_cpu_single.remote(data_item)
                 )
@@ -491,7 +493,6 @@ def _process_and_forward_batch(
         """Process batch data and forward to downstream with data flow tracking"""
         if not batch_data_with_metadata:
             return 0
-
         # separate the data, start time, and line number
         batch_data = [item[0] for item in batch_data_with_metadata]
         start_times = [item[1] for item in batch_data_with_metadata]
@@ -627,6 +628,16 @@ def _process_single_actor(self, op, actor, input_queue, final_results, result_lo
                 break

         logger.info(f"Single actor completed, processed
{processed_count} items") + def transform(self, data): + + if not isinstance(data.get("text"), list): + data["text"] = [data["text"]] + + if not isinstance(data.get("__dj__stats__"), list): + data["__dj__stats__"] = [data["__dj__stats__"]] + + return data + def transform_to_2d_format(self, data): """ Transform data to 2D format for processing.