datajuicer · cyruszhang · Jul 16, 2025 · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/README.md b/README.md
@@ -124,6 +124,7 @@ Besides, our paper is also updated to [v3](https://arxiv.org/abs/2309.02033).
   - [How-to Guide for Developers](docs/DeveloperGuide.md)
   - [Distributed Data Processing in Data-Juicer](docs/Distributed.md)
   - [Sandbox](docs/Sandbox.md)
+  - [Job Management & Monitoring](docs/JobManagement.md)
   - [Data-Juicer Agent](docs/DJ_agent.md)
 - Demos
   - [demos](demos/README.md)
@@ -141,6 +142,10 @@ Besides, our paper is also updated to [v3](https://arxiv.org/abs/2309.02033).
   - [Postprocess Tools](tools/postprocess/README.md)
   - [Preprocess Tools](tools/preprocess/README.md)
   - [Data Scoring](tools/quality_classifier/README.md)
+- Job Management & Monitoring
+  - [Processing Snapshot Utility](data_juicer/utils/job/snapshot.py) - Comprehensive job status analysis with JSON output
+  - [Job Management Tools](data_juicer/utils/job/) - Monitor and manage Data-Juicer processing jobs
+  - [Resource-Aware Partitioning](data_juicer/core/executor/partition_size_optimizer.py) - Automatic resource optimization for distributed processing
 - Third-party
   - [LLM Ecosystems](thirdparty/LLM_ecosystems/README.md)
   - [Third-party Model Library](thirdparty/models/README.md)

diff --git a/README_ZH.md b/README_ZH.md
@@ -117,6 +117,7 @@ Data-Juicer 现采用 AI 自动重写和优化算子的 docstring，并生成详
   - [开发者指南](docs/DeveloperGuide_ZH.md)
   - [Data-Juicer分布式数据处理](docs/Distributed_ZH.md)
   - [沙盒实验室](docs/Sandbox_ZH.md)
+  - [作业管理与监控](docs/JobManagement_ZH.md)
   - [Data-Juicer Agent](docs/DJ_agent_ZH.md)
 - Demos
   - [演示](demos/README_ZH.md)
@@ -136,6 +137,10 @@ Data-Juicer 现采用 AI 自动重写和优化算子的 docstring，并生成详
   - [后处理工具](tools/postprocess/README_ZH.md)
   - [预处理工具](tools/preprocess/README_ZH.md)
   - [给数据打分](tools/quality_classifier/README_ZH.md)
+- 作业管理与监控
+  - [处理快照工具](data_juicer/utils/job/snapshot.py) - 提供JSON格式的全面作业状态分析
+  - [作业管理工具](data_juicer/utils/job/) - 监控和管理Data-Juicer处理作业
+  - [资源感知分区](data_juicer/core/executor/partition_size_optimizer.py) - 分布式处理的自动资源优化
 - 第三方
   - [大语言模型生态](thirdparty/LLM_ecosystems/README_ZH.md)
   - [第三方模型库](thirdparty/models/README_ZH.md)

diff --git a/configs/demo/partition-checkpoint-eventlog-control.yaml b/configs/demo/partition-checkpoint-eventlog-control.yaml
@@ -0,0 +1,89 @@
+# =============================================================================
+# CONTROL CONFIG FOR partition-checkpoint-eventlog.yaml
+# =============================================================================
+# This is a control configuration file for partition-checkpoint-eventlog.yaml
+# that uses the non-partitioned Ray executor (executor_type: "ray") instead of
+# the partitioned executor (executor_type: "ray_partitioned").
+#
+# This config is useful for:
+# 1. Comparing performance between partitioned and non-partitioned executors
+# 2. Testing DAG execution without partitioning
+# 3. Simpler execution flow without partition management
+#
+# Key differences from partition-checkpoint-eventlog.yaml:
+# - executor_type: "ray" (instead of "ray_partitioned")
+# - No partition configuration needed
+# - Simpler execution model (no partition splitting/merging)
+# =============================================================================
+
+dataset_path: './demos/data/demo-dataset.jsonl'
+
+work_dir: "./outputs/partition-checkpoint-eventlog/{job_id}"
+export_path: '{work_dir}/processed.jsonl'
+np: 8
+
+executor_type: "ray"  # Non-partitioned Ray executor (control config)
+ray_address: "auto"
+
+# Process pipeline with real DataJuicer operations
+process:
+  # Text cleaning operations
+  - clean_links_mapper:
+      text_key: "text"
+      min_links: 0
+      max_links: 10
+
+  - clean_email_mapper:
+      text_key: "text"
+      min_emails: 0
+      max_emails: 5
+
+  - whitespace_normalization_mapper:
+      text_key: "text"
+
+  - fix_unicode_mapper:
+      text_key: "text"
+
+  # Text filtering operations
+  - text_length_filter:
+      text_key: "text"
+      min_len: 5
+      max_len: 10000
+
+  - alphanumeric_filter:
+      text_key: "text"
+      min_ratio: 0.1
+
+  # Quality filtering
+  - character_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.5
+
+  - word_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.5
+
+  - ray_bts_minhash_deduplicator:
+      tokenization: 'character'
+      lowercase: true
+      union_find_parallel_num: 2
+
+# Export configuration
+export_in_parallel: true
+keep_stats_in_res_ds: true
+keep_hashes_in_res_ds: true
+
+# =============================================================================
+# USAGE:
+# =============================================================================
+# This control config uses the non-partitioned Ray executor for comparison.
+# To use this config:
+#
+#    dj-process --config configs/demo/partition-checkpoint-eventlog-control.yaml
+#
+# For the partitioned executor version, use:
+#    dj-process --config configs/demo/partition-checkpoint-eventlog.yaml
+#
+# =============================================================================
diff --git a/configs/demo/partition-checkpoint-eventlog.yaml b/configs/demo/partition-checkpoint-eventlog.yaml
@@ -0,0 +1,155 @@
+# =============================================================================
+# COMPREHENSIVE DATAJUICER DEMO: Checkpointing, Event Logging & Job Management
+# =============================================================================
+# This demo showcases:
+# 1. Configurable checkpointing strategies
+# 2. Event logging with job-specific directories
+# 3. Flexible storage architecture
+# 4. Job resumption capabilities
+# 5. Real DataJuicer operations
+# =============================================================================
+
+# Data location configuration (Mandatory)
+dataset_path: './demos/data/demo-dataset.jsonl'
+
+# Work directory configuration
+# IMPORTANT: If using {job_id} placeholder, it MUST be the last part of the path
+# Examples:
+#   ✅ work_dir: "./outputs/my_project/{job_id}"     # Valid
+#   ✅ work_dir: "/data/experiments/{job_id}"        # Valid
+#   ❌ work_dir: "./outputs/{job_id}/results"        # Invalid - {job_id} not at end
+#   ❌ work_dir: "./{job_id}/outputs/data"           # Invalid - {job_id} not at end
+#
+# If no {job_id} is specified, job_id will be automatically appended:
+#   work_dir: "./outputs/my_project" → job_dir: "./outputs/my_project/20250804_143022_abc123"
+work_dir: "./outputs/partition-checkpoint-eventlog/{job_id}"
+export_path: '{work_dir}/processed.jsonl'
+
+# Executor configuration
+executor_type: "ray_partitioned"  # Use our enhanced partitioned executor
+ray_address: "auto"
+# np is auto-configured based on available cluster resources when partition.mode: "auto"
+# np: 2  # Number of Ray workers (auto-configured when partition.mode: "auto")
+
+# Separate storage configuration
+# Partition directory (Optional): stores the dataset partitions when using the ray_partitioned executor
+partition_dir: "{work_dir}/partitions"
+
+# Event logs: Fast storage (SSD, local disk) - small files, frequent writes (Optional)
+event_log_dir: "{work_dir}/event_logs"  # Optional: separate fast storage for event logs
+
+# Checkpoints: Large storage (HDD, network storage) - large files, infrequent writes (Optional)
+checkpoint_dir: "{work_dir}/checkpoints"  # Optional: separate large storage for checkpoints
+
+
+# Partition configuration
+partition:
+  mode: "manual"                      # Manual partition mode - uses num_of_partitions below (set to "auto" for automatic partitioning)
+  num_of_partitions: 4  # Number of partitions to create
+
+
+# Checkpoint configuration
+checkpoint:
+  enabled: false
+  strategy: "every_n_ops"
+  n_ops: 3
+  # strategy: "every_op"  # every_op, every_partition, every_n_ops, manual, disabled
+  # n_ops: 1  # Number of operations between checkpoints (for every_n_ops strategy)
+  # op_names: []  # Specific operation names to checkpoint after (for manual strategy)
+
+# Intermediate storage configuration (includes file lifecycle management)
+intermediate_storage:
+  format: "parquet"  # parquet, arrow, jsonl; defaults to parquet
+  write_partitions: false
+
+# Event logging configuration
+event_logging:
+  enabled: true
+
+# Process pipeline with real DataJuicer operations
+process:
+  # Text cleaning operations
+  - clean_links_mapper:
+      text_key: "text"
+      min_links: 0
+      max_links: 10
+
+  - clean_email_mapper:
+      text_key: "text"
+      min_emails: 0
+      max_emails: 5
+
+  - whitespace_normalization_mapper:
+      text_key: "text"
+
+  - fix_unicode_mapper:
+      text_key: "text"
+
+  # Text filtering operations
+  - text_length_filter:
+      text_key: "text"
+      min_len: 5
+      max_len: 10000
+
+  - alphanumeric_filter:
+      text_key: "text"
+      min_ratio: 0.1
+
+  # Quality filtering
+  - character_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.5
+
+  - word_repetition_filter:
+      text_key: "text"
+      min_ratio: 0.0
+      max_ratio: 0.5
+
+  - ray_bts_minhash_deduplicator:
+      tokenization: 'character'
+      lowercase: true
+      union_find_parallel_num: 2
+
+# Export configuration
+export_in_parallel: true
+keep_stats_in_res_ds: true
+keep_hashes_in_res_ds: true
+
+
+# =============================================================================
+# COMPLETE USER EXPERIENCE:
+# =============================================================================
+# 1. Start job:
+#    dj-process --config configs/demo/partition-checkpoint-eventlog.yaml
+#    # Output shows: Job ID (timestamp_configname_suffix), job directory, resumption command
+#    # Example: 20241201_143022_partition-checkpoint-eventlog_abc123
+#
+# 2. If job fails, resume with:
+#    dj-process --config configs/demo/partition-checkpoint-eventlog.yaml --job_id <job_id>
+#    # System validates job_id and shows previous status
+#
+# 3. Directory structure (flexible storage):
+#    outputs/partition-checkpoint-eventlog/{job_id}/
+#    ├── partitions/           # Dataset partitions (large files)
+#    ├── checkpoints/          # Operation checkpoints (large files)
+#    ├── event_logs/           # Event logs (small files, frequent writes)
+#    ├── metadata/             # Job metadata and mapping
+#    ├── results/              # Final processed dataset
+#    └── processed.jsonl       # Final output file
+#
+# 4. Resource Optimization:
+#    - Setting partition.mode: "auto" (this demo uses "manual") automatically optimizes:
+#      * Partition size based on data characteristics and available memory
+#      * Number of partitions based on dataset size and optimal partition size
+#      * Worker count (np) based on available CPU cores
+#      * Processing efficiency based on data modality (text, image, audio, video)
+#    - No manual tuning required - system adapts to your hardware and data
+#
+# 5. Monitoring and Debugging:
+#    - Real-time event logs in event_logs/ directory
+#    - Processing summary with statistics and timing
+#    - Checkpoint recovery for fault tolerance
+#    - Detailed resource utilization analysis
+#
+# =============================================================================
diff --git a/data_juicer/config/__init__.py b/data_juicer/config/__init__.py
@@ -6,7 +6,10 @@
     merge_config,
     prepare_cfgs_for_export,
     prepare_side_configs,
+    resolve_job_directories,
+    resolve_job_id,
     update_op_attr,
+    validate_work_dir_config,
 )
 
 __all__ = [
@@ -18,4 +21,7 @@
     "get_default_cfg",
     "prepare_cfgs_for_export",
     "update_op_attr",
+    "validate_work_dir_config",
+    "resolve_job_id",
+    "resolve_job_directories",
 ]