Skip to content

Commit 7a1a536

Browse files
authored
develop/template (#366)
* feat: 重新组织清洗模板
* 模板拆分适配
1 parent dc743c7 commit 7a1a536

File tree

5 files changed

+79
-68
lines changed

5 files changed

+79
-68
lines changed

runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ async def stop_cleaning_task(
181181
"""Stop cleaning task"""
182182
task_service = _get_task_service(db)
183183
await task_service.stop_task(db, task_id)
184+
await db.commit()
184185
return StandardResponse(code="0", message="success", data=task_id)
185186

186187

runtime/ops/mapper/img_resize/metadata.yml

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,19 @@ effect:
1515
inputs: 'image'
1616
outputs: 'image'
1717
settings:
18-
targetSize:
19-
name: 重采样尺寸
20-
type: multiple
21-
properties:
22-
- type: inputNumber
23-
name: 宽度
24-
description: 像素
25-
defaultVal: 256
26-
min: 1
27-
max: 4096
28-
step: 1
29-
- type: inputNumber
30-
name: 高度
31-
description: 像素
32-
defaultVal: 256
33-
min: 1
34-
max: 4096
35-
step: 1
18+
widthSize:
19+
name: 宽度
20+
type: inputNumber
21+
description: 像素
22+
defaultVal: 256
23+
min: 1
24+
max: 4096
25+
step: 1
26+
heightSize:
27+
type: inputNumber
28+
name: 高度
29+
description: 像素
30+
defaultVal: 256
31+
min: 1
32+
max: 4096
33+
step: 1

runtime/ops/mapper/img_resize/process.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
class ImgResize(Mapper):
1818
def __init__(self, *args, **kwargs):
1919
super(ImgResize, self).__init__(*args, **kwargs)
20-
self._target_size = kwargs.get("targetSize", [256, 256])
20+
self._width = int(kwargs.get("widthSize", 256))
21+
self._height = int(kwargs.get("heightSize", 256))
22+
self._target_size = [self._width, self._height]
2123

2224
@classmethod
2325
def _img_resize(cls, data: List[float], target_size: List[int]) -> List[float]:

scripts/db/data-cleaning-init.sql

Lines changed: 58 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -130,56 +130,66 @@ CREATE TRIGGER update_clean_template_updated_at
130130
-- 插入初始数据 - 清洗模板
131131
INSERT INTO t_clean_template (id, name, description)
132132
VALUES
133-
('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清洗模板'),
134-
('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板')
135-
ON CONFLICT (id) DO NOTHING;
133+
('550e8400-e29b-41d4-a716-446655440001', '安全与隐私合规处理模板', '针对敏感数据进行严格清洗,移除PII(个人身份信息)、政治敏感、暴力色情内容,适用于模型对外发布前的安全合规检查。'),
134+
('661f9500-f3ac-52e5-b827-557766550002', 'LLM SFT高质量文本清洗模板', '旨在生成高质量、低噪声的训练数据。包含去除乱码、重复内容、繁简转换、全角转半角以及格式标准化处理。'),
135+
('772a0611-a4bd-63f6-c938-668877660003', 'RAG知识库构建预处理模板', '专为RAG场景设计。重点去除目录、图注、XML/HTML标签等对向量检索无意义的噪声,并进行段落级去重以优化切片质量。'),
136+
('883b1722-b5ce-7407-d049-779988770004', '原始Web爬虫数据清洗模板', '针对互联网爬取的脏数据进行清洗。重点去除Emoji表情、URL链接、HTML标签以及不可见字符。'),
137+
('994c2833-c6df-8518-e150-880099880005', '多模态/CV模型训练预处理模板', '针对图像数据集处理。包含去除模糊/重复/相似图片,图片方向校正,目标检测预标注,以及尺寸和格式的统一化。')
138+
ON CONFLICT (id) DO NOTHING;
136139

137-
-- 插入初始数据 - 操作员实例(文本清洗模板)
138140
INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
139141
VALUES
140-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, NULL),
141-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, NULL),
142-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, NULL),
143-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, NULL),
144-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, NULL),
145-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, NULL),
146-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, NULL),
147-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, NULL),
148-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, NULL),
149-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, NULL),
150-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, NULL),
151-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, NULL),
152-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, NULL),
153-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, NULL),
154-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, NULL),
155-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, NULL),
156-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, NULL),
157-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, NULL),
158-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, NULL),
159-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, NULL),
160-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, NULL),
161-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, NULL),
162-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, NULL),
163-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, NULL),
164-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, NULL),
165-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, NULL),
166-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, NULL)
167-
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
168-
169-
-- 插入初始数据 - 操作员实例(图片清洗模板)
142+
('550e8400-e29b-41d4-a716-446655440001', 'PoliticalWordCleaner', 1, NULL),
143+
('550e8400-e29b-41d4-a716-446655440001', 'SexualAndViolentWordCleaner', 2, NULL),
144+
('550e8400-e29b-41d4-a716-446655440001', 'PiiDetector', 3, NULL),
145+
('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedIdNumber', 4, NULL),
146+
('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedCreditCardNumber', 5, NULL),
147+
('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedPhoneNumber', 6, NULL),
148+
('550e8400-e29b-41d4-a716-446655440001', 'EmailNumberCleaner', 7, NULL),
149+
('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedIpAddress', 8, NULL)
150+
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
151+
152+
INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
153+
VALUES
154+
('661f9500-f3ac-52e5-b827-557766550002', 'GrableCharactersCleaner', 1, NULL),
155+
('661f9500-f3ac-52e5-b827-557766550002', 'InvisibleCharactersCleaner', 2, NULL),
156+
('661f9500-f3ac-52e5-b827-557766550002', 'FullWidthCharacterCleaner', 3, NULL),
157+
('661f9500-f3ac-52e5-b827-557766550002', 'TraditionalChineseCleaner', 4, NULL),
158+
('661f9500-f3ac-52e5-b827-557766550002', 'FileWithShortOrLongLengthFilter', 5, '{"fileLength": [50, 8192]}'),
159+
('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighRepeatPhraseRateFilter', 6, NULL),
160+
('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighSpecialCharRateFilter', 7, NULL),
161+
('661f9500-f3ac-52e5-b827-557766550002', 'DuplicateFilesFilter', 8, NULL)
162+
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
163+
164+
165+
INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
166+
VALUES
167+
('772a0611-a4bd-63f6-c938-668877660003', 'HtmlTagCleaner', 1, '{"removeTableTags": "false"}'), -- 表格对RAG可能有价值,暂不去除表格
168+
('772a0611-a4bd-63f6-c938-668877660003', 'ContentCleaner', 2, NULL),
169+
('772a0611-a4bd-63f6-c938-668877660003', 'LegendCleaner', 3, NULL),
170+
('772a0611-a4bd-63f6-c938-668877660003', 'XMLTagCleaner', 4, NULL),
171+
('772a0611-a4bd-63f6-c938-668877660003', 'UnicodeSpaceCleaner', 5, NULL),
172+
('772a0611-a4bd-63f6-c938-668877660003', 'ExtraSpaceCleaner', 6, NULL),
173+
('772a0611-a4bd-63f6-c938-668877660003', 'DuplicateSentencesFilter', 7, NULL),
174+
('772a0611-a4bd-63f6-c938-668877660003', 'FileWithShortOrLongLengthFilter', 8, '{"fileLength": [20, 100000]}')
175+
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
176+
177+
INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
178+
VALUES
179+
('883b1722-b5ce-7407-d049-779988770004', 'HtmlTagCleaner', 1, '{"removeTableTags": "true"}'),
180+
('883b1722-b5ce-7407-d049-779988770004', 'AnonymizedUrlCleaner', 2, NULL),
181+
('883b1722-b5ce-7407-d049-779988770004', 'EmojiCleaner', 3, NULL),
182+
('883b1722-b5ce-7407-d049-779988770004', 'InvisibleCharactersCleaner', 4, NULL),
183+
('883b1722-b5ce-7407-d049-779988770004', 'ExtraSpaceCleaner', 5, NULL),
184+
('883b1722-b5ce-7407-d049-779988770004', 'DuplicateFilesFilter', 6, '{"fileDuplicateThreshold": 0.6}')
185+
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
186+
170187
INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
171188
VALUES
172-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, NULL),
173-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, NULL),
174-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, NULL),
175-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, NULL),
176-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, NULL),
177-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, NULL),
178-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, NULL),
179-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, NULL),
180-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, NULL),
181-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, NULL),
182-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, NULL),
183-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, NULL),
184-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, NULL)
185-
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
189+
('994c2833-c6df-8518-e150-880099880005', 'ImgBlurredImagesCleaner', 1, NULL),
190+
('994c2833-c6df-8518-e150-880099880005', 'ImgDuplicatedImagesCleaner', 2, NULL),
191+
('994c2833-c6df-8518-e150-880099880005', 'ImgSimilarImagesCleaner', 3, NULL),
192+
('994c2833-c6df-8518-e150-880099880005', 'ImgDirectionCorrect', 4, NULL),
193+
('994c2833-c6df-8518-e150-880099880005', 'ImgResize', 5, '{"widthSize": 512, "heightSize": 512}'),
194+
('994c2833-c6df-8518-e150-880099880005', 'ImgTypeUnify', 6, '{"imgType": "jpg"}')
195+
ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;

scripts/db/data-operator-init.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ VALUES
207207
('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
208208
('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'),
209209
('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'),
210-
('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', 8192, false, 'system', 'system'),
210+
('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"widthSize":{"name":"宽度","type":"inputNumber","description":"像素","defaultVal":256,"min":1,"max":4096,"step":1},"heightSize":{"type":"inputNumber","name":"高度","description":"像素","defaultVal":256,"min":1,"max":4096,"step":1}}', '', 8192, false, 'system', 'system'),
211211
('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
212212
('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
213213
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),

0 commit comments

Comments
 (0)