@@ -130,56 +130,66 @@ CREATE TRIGGER update_clean_template_updated_at
130130-- 插入初始数据 - 清洗模板
131131INSERT INTO t_clean_template (id, name, description)
132132VALUES
133- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' 文本清洗模板' , ' 文本清洗模板' ),
134- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' 图片清洗模板' , ' 图片清洗模板' )
135- ON CONFLICT (id) DO NOTHING;
133+ (' 550e8400-e29b-41d4-a716-446655440001' , ' 安全与隐私合规处理模板' , ' 针对敏感数据进行严格清洗,移除PII(个人身份信息)、政治敏感、暴力色情内容,适用于模型对外发布前的安全合规检查。' ),
134+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' LLM SFT高质量文本清洗模板' , ' 旨在生成高质量、低噪声的训练数据。包含去除乱码、重复内容、繁简转换、全角转半角以及格式标准化处理。' ),
135+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' RAG知识库构建预处理模板' , ' 专为RAG场景设计。重点去除目录、图注、XML/HTML标签等对向量检索无意义的噪声,并进行段落级去重以优化切片质量。' ),
136+ (' 883b1722-b5ce-7407-d049-779988770004' , ' 原始Web爬虫数据清洗模板' , ' 针对互联网爬取的脏数据进行清洗。重点去除Emoji表情、URL链接、HTML标签以及不可见字符。' ),
137+ (' 994c2833-c6df-8518-e150-880099880005' , ' 多模态/CV模型训练预处理模板' , ' 针对图像数据集处理。包含去除模糊/重复/相似图片,图片方向校正,目标检测预标注,以及尺寸和格式的统一化。' )
138+ ON CONFLICT (id) DO NOTHING;
136139
137- -- 插入初始数据 - 操作员实例(文本清洗模板)
138140INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
139141VALUES
140- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FileWithShortOrLongLengthFilter' , 1 , NULL ),
141- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FileWithHighRepeatWordRateFilter' , 2 , NULL ),
142- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FileWithHighRepeatPhraseRateFilter' , 3 , NULL ),
143- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FileWithHighSpecialCharRateFilter' , 4 , NULL ),
144- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FileWithManySensitiveWordsFilter' , 5 , NULL ),
145- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' UnicodeSpaceCleaner' , 6 , NULL ),
146- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' ExtraSpaceCleaner' , 7 , NULL ),
147- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' FullWidthCharacterCleaner' , 8 , NULL ),
148- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' InvisibleCharactersCleaner' , 9 , NULL ),
149- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' ContentCleaner' , 10 , NULL ),
150- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' LegendCleaner' , 11 , NULL ),
151- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' EmojiCleaner' , 12 , NULL ),
152- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' HtmlTagCleaner' , 13 , NULL ),
153- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' TraditionalChineseCleaner' , 14 , NULL ),
154- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' GrableCharactersCleaner' , 15 , NULL ),
155- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' XMLTagCleaner' , 16 , NULL ),
156- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' DuplicateSentencesFilter' , 17 , NULL ),
157- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' DuplicateFilesFilter' , 18 , NULL ),
158- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' SexualAndViolentWordCleaner' , 19 , NULL ),
159- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' PoliticalWordCleaner' , 20 , NULL ),
160- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' AnonymizedPhoneNumber' , 21 , NULL ),
161- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' AnonymizedCreditCardNumber' , 22 , NULL ),
162- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' EmailNumberCleaner' , 23 , NULL ),
163- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' AnonymizedIpAddress' , 24 , NULL ),
164- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' AnonymizedIdNumber' , 25 , NULL ),
165- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' AnonymizedUrlCleaner' , 26 , NULL ),
166- (' 26ae585c-8310-4679-adc0-e53215e6e69b' , ' PiiDetector' , 27 , NULL )
167- ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
168-
169- -- 插入初始数据 - 操作员实例(图片清洗模板)
142+ (' 550e8400-e29b-41d4-a716-446655440001' , ' PoliticalWordCleaner' , 1 , NULL ),
143+ (' 550e8400-e29b-41d4-a716-446655440001' , ' SexualAndViolentWordCleaner' , 2 , NULL ),
144+ (' 550e8400-e29b-41d4-a716-446655440001' , ' PiiDetector' , 3 , NULL ),
145+ (' 550e8400-e29b-41d4-a716-446655440001' , ' AnonymizedIdNumber' , 4 , NULL ),
146+ (' 550e8400-e29b-41d4-a716-446655440001' , ' AnonymizedCreditCardNumber' , 5 , NULL ),
147+ (' 550e8400-e29b-41d4-a716-446655440001' , ' AnonymizedPhoneNumber' , 6 , NULL ),
148+ (' 550e8400-e29b-41d4-a716-446655440001' , ' EmailNumberCleaner' , 7 , NULL ),
149+ (' 550e8400-e29b-41d4-a716-446655440001' , ' AnonymizedIpAddress' , 8 , NULL )
150+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
151+
152+ INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
153+ VALUES
154+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' GrableCharactersCleaner' , 1 , NULL ),
155+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' InvisibleCharactersCleaner' , 2 , NULL ),
156+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' FullWidthCharacterCleaner' , 3 , NULL ),
157+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' TraditionalChineseCleaner' , 4 , NULL ),
158+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' FileWithShortOrLongLengthFilter' , 5 , ' {"fileLength": [50, 8192]}' ),
159+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' FileWithHighRepeatPhraseRateFilter' , 6 , NULL ),
160+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' FileWithHighSpecialCharRateFilter' , 7 , NULL ),
161+ (' 661f9500-f3ac-52e5-b827-557766550002' , ' DuplicateFilesFilter' , 8 , NULL )
162+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
163+
164+
165+ INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
166+ VALUES
167+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' HtmlTagCleaner' , 1 , ' {"removeTableTags": "false"}' ), -- 表格对RAG可能有价值,暂不去除表格
168+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' ContentCleaner' , 2 , NULL ),
169+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' LegendCleaner' , 3 , NULL ),
170+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' XMLTagCleaner' , 4 , NULL ),
171+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' UnicodeSpaceCleaner' , 5 , NULL ),
172+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' ExtraSpaceCleaner' , 6 , NULL ),
173+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' DuplicateSentencesFilter' , 7 , NULL ),
174+ (' 772a0611-a4bd-63f6-c938-668877660003' , ' FileWithShortOrLongLengthFilter' , 8 , ' {"fileLength": [20, 100000]}' )
175+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
176+
177+ INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
178+ VALUES
179+ (' 883b1722-b5ce-7407-d049-779988770004' , ' HtmlTagCleaner' , 1 , ' {"removeTableTags": "true"}' ),
180+ (' 883b1722-b5ce-7407-d049-779988770004' , ' AnonymizedUrlCleaner' , 2 , NULL ),
181+ (' 883b1722-b5ce-7407-d049-779988770004' , ' EmojiCleaner' , 3 , NULL ),
182+ (' 883b1722-b5ce-7407-d049-779988770004' , ' InvisibleCharactersCleaner' , 4 , NULL ),
183+ (' 883b1722-b5ce-7407-d049-779988770004' , ' ExtraSpaceCleaner' , 5 , NULL ),
184+ (' 883b1722-b5ce-7407-d049-779988770004' , ' DuplicateFilesFilter' , 6 , ' {"fileDuplicateThreshold": 0.6}' )
185+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
186+
170187INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
171188VALUES
172- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgBlurredImagesCleaner' , 1 , NULL ),
173- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgDuplicatedImagesCleaner' , 2 , NULL ),
174- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgSimilarImagesCleaner' , 3 , NULL ),
175- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgBrightness' , 4 , NULL ),
176- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgContrast' , 5 , NULL ),
177- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgSaturation' , 6 , NULL ),
178- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgSharpness' , 7 , NULL ),
179- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgDenoise' , 8 , NULL ),
180- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgShadowRemove' , 9 , NULL ),
181- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgPerspectiveTransformation' , 10 , NULL ),
182- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgDirectionCorrect' , 11 , NULL ),
183- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgResize' , 12 , NULL ),
184- (' 4421504e-c6c9-4760-b55a-509d17429597' , ' ImgTypeUnify' , 13 , NULL )
185- ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
189+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgBlurredImagesCleaner' , 1 , NULL ),
190+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgDuplicatedImagesCleaner' , 2 , NULL ),
191+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgSimilarImagesCleaner' , 3 , NULL ),
192+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgDirectionCorrect' , 4 , NULL ),
193+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgResize' , 5 , ' {"widthSize": 512, "heightSize": 512}' ),
194+ (' 994c2833-c6df-8518-e150-880099880005' , ' ImgTypeUnify' , 6 , ' {"imgType": "jpg"}' )
195+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
0 commit comments