Skip to content

Commit 716ccaa

Browse files
committed
增加数据库插入
1 parent dab469f commit 716ccaa

File tree

6 files changed

+168
-6
lines changed

6 files changed

+168
-6
lines changed

frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const ParamConfig: React.FC<ParamConfigProps> = ({
2828
if (!param) return null;
2929
let defaultVal: any = param.defaultVal;
3030
if (param.type === "range") {
31-
31+
3232
defaultVal = Array.isArray(param.defaultVal)
3333
? param.defaultVal
3434
: [
@@ -219,7 +219,7 @@ const ParamConfig: React.FC<ParamConfigProps> = ({
219219
<Switch
220220
checkedChildren={param.checkedLabel}
221221
unCheckedChildren={param.unCheckedLabel}
222-
defaultChecked={param.defaultVal === 'true'}
222+
defaultChecked={String(param.defaultVal).toLowerCase() === 'true'}
223223
onChange={(checked) => updateValue(checked)}
224224
/>
225225
</Form.Item>
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
import uuid
3+
from loguru import logger
4+
import mimetypes
5+
from datetime import datetime
6+
7+
from datamate.sql_manager.persistence_atction import TaskInfoPersistence
8+
9+
class FileScanner:
    """Scan a directory tree and bulk-insert newly discovered files for a dataset.

    Workflow: collect file metadata from disk, diff the scanned paths against
    the paths already persisted for the dataset, then insert only the new
    records in fixed-size batches.
    """

    def __init__(self, dataset_id):
        # dataset_id scopes both the existing-path lookup and the inserted rows.
        self.dataset_id = dataset_id
        self.persistence = TaskInfoPersistence()

    def get_existing_paths(self):
        """Return the set of file paths already stored in the DB for this dataset.

        Optimization 1: fetch everything once and keep it in a set so each
        scanned file costs an O(1) membership test instead of a DB round-trip.
        """
        logger.info("Fetching existing files from DB...")
        existing_files = self.persistence.query_existing_files(self.dataset_id)
        # fetchall() yields one-element row tuples [('path1',), ('path2',)];
        # flatten them into {'path1', 'path2'} for O(1) lookup.
        existing_set = {row[0] for row in existing_files}
        logger.info(f"Found {len(existing_set)} existing files in DB.")
        return existing_set

    def prepare_file_data(self, sample, file_id):
        """Assemble one DB-ready record dict from scanned file metadata.

        Optimization 2: split out of the old update_file_result — this method
        only shapes data for an executemany-style batch insert and never
        touches the database itself.

        :param sample: dict with fileSize/fileType/fileName/dataset_id/filePath keys
        :param file_id: pre-generated primary key for the new row
        :return: dict keyed by the dataset-file table's column names
        """
        file_size = str(sample.get("fileSize"))
        file_type = str(sample.get("fileType"))
        file_name = str(sample.get("fileName"))
        dataset_id = str(sample.get("dataset_id"))
        file_path = str(sample.get("filePath"))
        create_time = datetime.now()

        # Use the file's modification time as "last access"; fall back to now
        # if the file vanished between the scan and this call.
        # NOTE(review): getmtime is *modification* time, not access time —
        # confirm that is the intended semantics for last_access_time.
        try:
            last_access_time = datetime.fromtimestamp(os.path.getmtime(file_path))
        except (FileNotFoundError, OSError):
            last_access_time = create_time

        return {
            "id": file_id,
            "dataset_id": dataset_id,
            "file_name": file_name,
            "file_path": file_path,
            "file_type": file_type,
            "file_size": file_size,
            "status": "COMPLETED",
            "upload_time": create_time,
            "last_access_time": last_access_time,
            "created_at": create_time,
            "updated_at": create_time
        }

    def scan_and_process(self, root_dir, batch_size=5000):
        """Walk *root_dir*, diff against the DB, and batch-insert new files.

        :param root_dir: directory tree to scan
        :param batch_size: max rows per INSERT batch (guards against oversized
                           SQL packets)
        """
        logger.info(f"Scanning directory: {root_dir}")

        # 1. Collect every scanned file in memory as {path: metadata_dict}.
        scanned_files_map = {}

        for root, dirs, files in os.walk(root_dir):
            # Prune hidden directories in place so os.walk never descends into
            # them (e.g. .git/, .cache/) — previously only hidden *files* were
            # skipped, so non-dot files inside hidden dirs were still inserted.
            dirs[:] = [d for d in dirs if not d.startswith('.')]
            for file in files:
                if file.startswith('.'):
                    continue

                full_path = os.path.join(root, file)

                # Gather metadata up front; skip files that disappear mid-scan.
                try:
                    stats = os.stat(full_path)
                    f_type, _ = mimetypes.guess_type(full_path)
                    if not f_type:
                        # Fall back to the raw extension when the MIME type
                        # cannot be guessed.
                        f_type = os.path.splitext(file)[1]

                    # Same "sample" shape that prepare_file_data consumes.
                    scanned_files_map[full_path] = {
                        "fileSize": stats.st_size,
                        "fileType": f_type,
                        "fileName": file,
                        "dataset_id": self.dataset_id,
                        "filePath": full_path
                    }
                except OSError:
                    continue

        logger.info(f"Scanned {len(scanned_files_map)} files on disk.")

        # 2. Paths already present in the database.
        existing_paths = self.get_existing_paths()

        # 3. Set difference in memory -> only the paths that need inserting.
        scanned_paths_set = set(scanned_files_map.keys())
        new_paths = list(scanned_paths_set - existing_paths)

        logger.info(f"Need to insert {len(new_paths)} new files.")

        if not new_paths:
            logger.info("No new files to insert.")
            return

        # 4. Build and flush insert batches.
        insert_batch = []
        total_inserted = 0

        for path in new_paths:
            sample_data = scanned_files_map[path]
            new_file_id = str(uuid.uuid4())

            record = self.prepare_file_data(sample_data, new_file_id)
            insert_batch.append(record)

            # Optimization 3: flush in batches so a single oversized statement
            # cannot blow up the SQL layer.
            if len(insert_batch) >= batch_size:
                self.persistence.batch_insert_files(insert_batch)
                total_inserted += len(insert_batch)
                logger.info(f"Progress: {total_inserted}/{len(new_paths)} inserted...")
                insert_batch = []  # reset for the next batch

        # Flush the final partial batch.
        if insert_batch:
            self.persistence.batch_insert_files(insert_batch)
            total_inserted += len(insert_batch)

        logger.info(f"Done. Total inserted: {total_inserted}")

runtime/python-executor/datamate/sql_manager/persistence_atction.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
import json
4-
import time
54
import os
5+
import time
66
import uuid
77
from datetime import datetime
88
from pathlib import Path
@@ -78,6 +78,18 @@ def update_file_result(self, sample, file_id):
7878
}
7979
self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))
8080

81+
def query_existing_files(self, dataset_id: str):
    """Fetch all file-path rows stored for *dataset_id*.

    Returns the raw fetchall() result: a list of one-element row tuples,
    e.g. [('path1',), ('path2',)].
    """
    # Dropped the dead ``result = None`` pre-assignment — the value is always
    # produced inside the connection context.
    query_sql = str(self.sql_dict.get("query_dataset_files_sql"))
    with SQLManager.create_connect() as conn:
        # Parameterized query: dataset_id is bound, never interpolated.
        execute_result = conn.execute(text(query_sql), {"dataset_id": dataset_id})
        return execute_result.fetchall()
88+
89+
def batch_insert_files(self, samples):
    """Bulk-insert the given dataset-file records via the configured SQL."""
    sql = str(self.sql_dict.get("insert_dataset_file_sql"))
    self.batch_execute(sql, samples)
92+
8193
def persistence_task_info(self, sample: Dict[str, Any]):
8294
file_id = str(uuid.uuid4())
8395
self.update_task_result(sample, file_id)
@@ -102,6 +114,22 @@ def insert_result(data, sql):
102114
raise RuntimeError(82000, str(e)) from None
103115
raise Exception("Max retries exceeded")
104116

117+
@staticmethod
def batch_execute(sql, args_list):
    """Execute *sql* once per element of *args_list* (executemany-style).

    :param sql: SQL statement with named bind parameters,
                e.g. "INSERT INTO t (a, b) VALUES (:a, :b)"
                (the previous docstring showed ``%s``/tuple placeholders,
                which does not match the ``text()`` named-bind usage below)
    :param args_list: list of parameter dicts, one dict per row
    :raises Exception: re-raises whatever the driver raised, after rollback
    """
    with SQLManager.create_connect() as conn:
        try:
            # Passing a list of parameter dicts makes SQLAlchemy run the
            # statement in executemany mode.
            conn.execute(text(sql), args_list)
            # NOTE(review): no explicit conn.commit() here — assumes
            # create_connect() commits on successful context exit; verify.
        except Exception as e:
            conn.rollback()
            logger.error(f"批量插入失败: {e}")
            # Bare raise preserves the original traceback without adding
            # this frame again (``raise e`` would).
            raise
132+
105133
def update_result(self, dataset_id, instance_id, status):
106134
dataset_data = {
107135
"dataset_id": dataset_id

runtime/python-executor/datamate/sql_manager/sql/sql_config.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@
1313
"create_similar_img_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id SERIAL PRIMARY KEY, task_uuid VARCHAR(255), p_hash TEXT, des_matrix BYTEA, matrix_shape TEXT, file_name TEXT, timestamp TIMESTAMP);",
1414
"delete_similar_img_tables_sql": "DELETE FROM operator_similar_img_features WHERE flow_id = :flow_id",
1515
"create_similar_text_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id SERIAL PRIMARY KEY, task_uuid VARCHAR(255), file_feature TEXT, file_name TEXT, timestamp TIMESTAMP);",
16-
"delete_similar_text_tables_sql": "DELETE FROM operators_similar_text_features WHERE flow_id = :flow_id"
17-
}
16+
"delete_similar_text_tables_sql": "DELETE FROM operators_similar_text_features WHERE flow_id = :flow_id",
17+
"query_dataset_files_sql": "SELECT file_path FROM t_dm_dataset_files WHERE dataset_id = :dataset_id"
18+
}

runtime/python-executor/datamate/wrappers/datamate_executor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def run(self):
4747
for _ in dataset.data.iter_batches():
4848
pass
4949

50+
self.scan_files()
5051

5152
if __name__ == '__main__':
5253

runtime/python-executor/datamate/wrappers/executor.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
from typing import Dict
44

5+
from datamate.common.utils.file_scanner import FileScanner
56
import ray
67
from jsonargparse import dict_to_namespace
78
from loguru import logger
@@ -77,4 +78,8 @@ def load_dataset(self, jsonl_file_path = None):
7778

7879
def update_db(self, status):
    """Persist *status* for this executor's dataset/instance pair."""
    persistence = TaskInfoPersistence()
    persistence.update_result(self.cfg.dataset_id, self.cfg.instance_id, status)
82+
83+
def scan_files(self):
    """Scan the configured export path and register new files for the dataset."""
    file_scanner = FileScanner(self.cfg.dataset_id)
    file_scanner.scan_and_process(self.cfg.export_path)

0 commit comments

Comments
 (0)