Skip to content

Commit d0b3ae0

Browse files
authored
Merge pull request #468 from unit-mesh/feat/domain-dict-optimization
feat(domain-dict): optimize batch processing and add LLM logging
2 parents 0632b7f + e0074fe commit d0b3ae0

File tree

3 files changed

+225
-356
lines changed

3 files changed

+225
-356
lines changed

mpp-core/src/commonMain/kotlin/cc/unitmesh/agent/subagent/DomainDictAgent.kt

Lines changed: 172 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -161,8 +161,11 @@ class DomainDictAgent(
161161
insights.domainConcepts.size
162162
)
163163

164+
// Analyze package structure to find important business packages
165+
val importantPackages = analyzePackageStructure(insights, onProgress)
166+
164167
// Extract meaningful names from hot files
165-
val codebaseNames = extractMeaningfulNames(insights, onProgress)
168+
val codebaseNames = extractMeaningfulNames(insights, onProgress, importantPackages)
166169
onProgress(" 📋 Found ${codebaseNames.size} candidate names")
167170

168171
// Filter out existing terms
@@ -185,11 +188,21 @@ class DomainDictAgent(
185188
// ============= Step 2: Generate Entries =============
186189
onProgress("\n## Step 2/3: Generating Entries")
187190

188-
val namesToProcess = newNames.take(500)
189-
onProgress(" 💭 Translating ${namesToProcess.size} terms (of ${newNames.size} total)...")
191+
// Process in smaller batches for faster LLM responses
192+
val batchSize = 100
193+
val maxBatches = 3 // Process at most 3 batches = 300 terms
194+
val namesToProcess = newNames.take(batchSize * maxBatches)
195+
val allNewEntries = mutableListOf<DomainEntry>()
196+
197+
namesToProcess.chunked(batchSize).forEachIndexed { index, batch ->
198+
onProgress(" 💭 Batch ${index + 1}: Translating ${batch.size} terms...")
199+
val batchEntries = generateEntries(batch, callbacks)
200+
allNewEntries.addAll(batchEntries)
201+
onProgress(" ✅ Batch ${index + 1}: Got ${batchEntries.size} entries")
202+
}
190203

191-
val newEntries = generateEntries(namesToProcess, callbacks)
192-
onProgress(" ✅ Generated ${newEntries.size} entries")
204+
val newEntries = allNewEntries
205+
onProgress(" 📊 Total: ${newEntries.size} entries from ${namesToProcess.size} terms")
193206

194207
// Show generated entries
195208
newEntries.take(10).forEach { entry ->
@@ -274,21 +287,95 @@ class DomainDictAgent(
274287
return result
275288
}
276289

277-
suspend fun extractMeaningfulNames(
290+
/**
 * Rank the packages touched by hot files and return the most-changed ones
 * that do not look like infrastructure.
 *
 * Each hot file's path is reduced to a package-like directory via
 * [extractPackagePath]; the file's change count is accumulated per package.
 * A package is dropped when its lowercased path contains any of the
 * infrastructure markers as a substring. The remaining packages are ordered
 * by accumulated change count (highest first) and capped at 20.
 *
 * @param insights codebase insights whose `hotFiles` are inspected
 * @param onProgress sink for human-readable progress messages
 * @return up to 20 package paths, in descending order of accumulated changes
 */
private fun analyzePackageStructure(
    insights: CodebaseInsightsResult,
    onProgress: (String) -> Unit
): Set<String> {
    onProgress(" 📦 Analyzing package structure...")

    // Accumulate change counts per package; files without a directory part are ignored.
    val changesByPackage = mutableMapOf<String, Int>()
    insights.hotFiles.forEach { hotFile ->
        val pkg = extractPackagePath(hotFile.path)
        if (pkg.isNotEmpty()) {
            changesByPackage[pkg] = changesByPackage.getOrElse(pkg) { 0 } + hotFile.changeCount
        }
    }

    // Substring markers that flag a package as infrastructure rather than business code.
    val infrastructureMarkers = setOf(
        "test", "config", "util", "utils", "helper", "common",
        "generated", "build", "gradle", "node_modules", "target"
    )

    fun looksLikeInfrastructure(pkg: String): Boolean {
        val lowered = pkg.lowercase()
        return infrastructureMarkers.any { marker -> marker in lowered }
    }

    // Keep business packages only, most frequently changed first.
    val rankedPackages = changesByPackage.entries
        .filterNot { looksLikeInfrastructure(it.key) }
        .sortedByDescending { it.value }
        .take(20)
        .mapTo(linkedSetOf<String>()) { it.key }

    if (rankedPackages.isNotEmpty()) {
        onProgress(" 📁 Top business packages:")
        for (pkg in rankedPackages.take(5)) {
            val count = changesByPackage[pkg] ?: 0
            onProgress("$pkg (${count} changes)")
        }
    }

    return rankedPackages
}
340+
341+
/**
 * Derive a package-like directory path from a source file path.
 *
 * Examples:
 * - `src/main/kotlin/cc/unitmesh/agent/Tool.kt` -> `cc/unitmesh/agent`
 * - `mpp-core/src/commonMain/kotlin/cc/unitmesh/agent/Tool.kt` -> `cc/unitmesh/agent`
 *
 * A leading `.../src/<sourceSet>/<language>/` prefix is removed when the source
 * set is `main`, `common`, or any name ending in `Main` (the multiplatform
 * convention, e.g. `commonMain`, `jvmMain`) and the language directory is
 * `kotlin`, `java` or `scala`. Any remaining leading `.../src/` is then removed.
 * Other source-set directories (for example `test`) are not stripped and so
 * remain part of the result.
 *
 * @param filePath file path using `/` as the separator
 * @return the directory portion of the cleaned path, or an empty string when
 *         the cleaned path has no directory component
 */
private fun extractPackagePath(filePath: String): String {
    // Matches both rooted ("module/src/...") and relative ("src/...") layouts.
    // `[A-Za-z0-9]+Main` covers multiplatform source sets, which would otherwise
    // leak into the package path as "commonMain/kotlin/...".
    val sourceRoot = Regex("^(?:.*/)?src/(?:main|common|[A-Za-z0-9]+Main)/(?:kotlin|java|scala)/")
    // Fallback for layouts without a recognised source-set/language pair.
    val bareSrcRoot = Regex("^(?:.*/)?src/")

    val cleanPath = filePath
        .replace(sourceRoot, "")
        .replace(bareSrcRoot, "")

    // Drop the file name; a path without any "/" has no package.
    return cleanPath.substringBeforeLast("/", "")
}
358+
359+
suspend fun extractMeaningfulNames(
360+
insights: CodebaseInsightsResult,
361+
onProgress: (String) -> Unit,
362+
importantPackages: Set<String> = emptySet()
280363
): List<String> {
281364
val hotFileNames = mutableSetOf<String>()
282365
val allConceptNames = mutableSetOf<String>()
283-
// since it's lowly we just disable it
284-
// if (codeParser != null) {
285-
// onProgress(" 🌲 Using TreeSitter to parse hot files...")
286-
// val hotFilesWithCode = parseHotFilesWithTreeSitter(insights.hotFiles, onProgress)
287-
// hotFileNames.addAll(hotFilesWithCode)
288-
// }
366+
367+
// Prioritize files from important packages
368+
val prioritizedFiles = if (importantPackages.isNotEmpty()) {
369+
insights.hotFiles.sortedByDescending { file ->
370+
val pkg = extractPackagePath(file.path)
371+
if (importantPackages.any { pkg.startsWith(it) || it.startsWith(pkg) }) 2 else 1
372+
}
373+
} else {
374+
insights.hotFiles
375+
}
289376

290377
// Also extract from hot file names
291-
for (file in insights.hotFiles) {
378+
for (file in prioritizedFiles) {
292379
val fileName = file.path.substringAfterLast("/").substringBeforeLast(".")
293380
val domainName = extractDomainFromFileName(fileName)
294381
if (domainName != null && isValidDomainName(domainName)) {
@@ -327,19 +414,26 @@ class DomainDictAgent(
327414
}
328415

329416
/**
330-
* Less strict validation for domain concepts (already extracted from code)
417+
* Validation for domain concepts - must be compound names (like "DomainDict", not "Agent")
418+
* Single words are too generic and don't provide business context
331419
*/
332420
private fun isValidDomainConceptName(name: String): Boolean {
333421
if (name.length < 3) return false
334422
if (name.length > 60) return false
335423

336424
val lowerName = name.lowercase()
337425

338-
// Skip very common/generic names
426+
// Skip only pure technical/programming terms (let AI decide business relevance)
339427
val skipExact = setOf(
340-
"unknown", "init", "test", "main", "app", "get", "set", "is", "has",
428+
// Language keywords & primitives
429+
"unknown", "init", "test", "main", "get", "set", "is", "has",
341430
"string", "int", "list", "map", "object", "class", "function",
342-
"true", "false", "null", "void", "return", "if", "else", "for", "while"
431+
"true", "false", "null", "void", "return", "if", "else", "for", "while",
432+
// Pure infrastructure patterns
433+
"impl", "util", "utils", "helper", "helpers", "base", "abstract",
434+
"interface", "default", "common", "internal", "private", "public",
435+
// Build/test artifacts
436+
"spec", "mock", "stub", "fake", "gradle", "build", "index"
343437
)
344438
if (lowerName in skipExact) return false
345439

@@ -349,6 +443,11 @@ class DomainDictAgent(
349443
// Skip special characters
350444
if (name.contains("<") || name.contains(">") || name.contains("$")) return false
351445

446+
// IMPORTANT: Require at least 2 capital letters (compound name)
447+
// This ensures we get "DomainDict" not "Agent"
448+
val capitalCount = name.count { it.isUpperCase() }
449+
if (capitalCount < 2) return false
450+
352451
return true
353452
}
354453

@@ -496,65 +595,34 @@ class DomainDictAgent(
496595

497596
val lowerName = name.lowercase()
498597

499-
// Skip generic/common terms (infrastructure, not domain)
598+
// Skip only pure technical/infrastructure terms
500599
val skipTerms = setOf(
501-
// Testing
600+
// Testing artifacts
502601
"test", "tests", "spec", "mock", "stub", "fake",
503-
// Implementation details
504-
"impl", "util", "utils", "helper", "helpers", "factory",
505-
"base", "abstract", "interface", "default", "common",
506-
// Build/config
507-
"main", "app", "application", "index",
508-
"run", "build", "gradle", "config", "settings",
509-
// Generic programming concepts (too common)
510-
"activity", "action", "event", "listener", "handler", "callback",
511-
"model", "data", "item", "entry", "node", "element",
512-
"list", "map", "set", "array", "collection", "queue",
513-
"context", "state", "status", "type", "kind", "mode",
514-
"info", "detail", "result", "response", "request",
515-
"color", "border", "icon", "image", "font", "style",
516-
"file", "path", "name", "key", "value", "id",
517-
"size", "width", "height", "offset", "padding", "margin",
518-
"consumer", "producer", "provider", "service", "manager",
519-
"builder", "creator", "generator", "loader", "reader", "writer",
520-
"parser", "formatter", "converter", "adapter", "wrapper",
521-
"view", "panel", "dialog", "screen", "page", "component",
522-
"button", "text", "label", "field", "input", "output",
523-
"editor", "renderer", "painter", "drawer",
524-
"exception", "error", "warning", "message",
525-
"checks", "diff", "check", "unknown"
602+
// Pure implementation details
603+
"impl", "util", "utils", "helper", "helpers",
604+
"base", "abstract", "interface", "default", "common", "internal",
605+
// Build/config files
606+
"main", "index", "build", "gradle"
526607
)
527608

528609
// Exact match skip
529610
if (lowerName in skipTerms) return false
530611

531-
// Skip IntelliJ platform concepts (infrastructure)
612+
// Skip IntelliJ/JetBrains platform internals (framework-specific, not business)
532613
val platformTerms = setOf(
533-
"anaction", "applicationmanager", "project", "psifile", "psielement",
534-
"virtualfile", "document", "editor", "intention", "inspection",
535-
"psiclass", "psimethod", "psifield", "psitype", "psivariable",
536-
"language", "filetype", "module", "facet", "artifact",
537-
"toolwindow", "notification", "progress", "indicator",
538-
"runnable", "callable", "future", "promise", "deferred",
539-
// JetBrains specific
614+
"anaction", "psifile", "psielement", "psiclass", "psimethod",
615+
"psifield", "psitype", "psivariable", "virtualfile",
616+
// JetBrains UI components
540617
"jbcolor", "jbinsets", "jbui", "jbpopup", "jblist",
541-
// Java Swing/AWT
542-
"jcomponent", "jpanel", "jbutton", "jlabel", "jframe",
543-
"swing", "awt", "graphics"
618+
// Java Swing/AWT internals
619+
"jcomponent", "jpanel", "jbutton", "jlabel", "jframe"
544620
)
545621
if (platformTerms.any { lowerName.contains(it) }) return false
546622

547-
// Skip technical suffixes that indicate infrastructure
623+
// Skip pure infrastructure suffixes
548624
val technicalSuffixes = setOf(
549-
"controller", "service", "repository", "dao", "mapper",
550-
"dto", "vo", "po", "entity", "request", "response",
551-
"config", "configuration", "settings", "properties",
552-
"handler", "listener", "callback", "adapter", "wrapper",
553-
"factory", "builder", "provider", "manager", "registry",
554-
"helper", "util", "utils", "tool", "tools",
555-
"impl", "implementation", "abstract", "base", "default",
556-
"exception", "error", "filter", "interceptor",
557-
"capable", "aware", "enabled", "disabled"
625+
"impl", "implementation", "dto", "vo", "po"
558626
)
559627
if (technicalSuffixes.any { lowerName.endsWith(it) }) return false
560628

@@ -588,29 +656,27 @@ class DomainDictAgent(
588656

589657
val namesList = names.joinToString("\n") { "- $it" }
590658

591-
// DDD-focused prompt, inspired by indexer.vm
659+
// DDD-focused prompt - extract compound domain concepts only
592660
val prompt = """
593-
你是一个 DDD(领域驱动设计)专家,负责构建业务导向的中英文词典。请从以下代码名称中提取重要的业务概念
661+
你是一个 DDD(领域驱动设计)专家,负责构建业务导向的中英文词典。请从以下代码名称中提取**复合业务概念**
594662
595-
**提取原则:**
663+
**核心规则:只提取复合词(至少包含2个有意义的单词)**
596664
597-
✅ 应该提取的内容:
598-
- 核心业务实体(如:Blog、Comment、Payment、User 等名词)
599-
- 业务概念和领域模型(如:Member、Points、Order)
600-
- 难以理解的词汇或拼音缩写
601-
- 领域特定术语
665+
✅ 应该提取的内容(复合词示例)
666+
- DomainDict(领域词典)- 由 Domain + Dict 组成
667+
- CodeReview(代码审查)- 由 Code + Review 组成
668+
- ChatContext(聊天上下文)- 由 Chat + Context 组成
669+
- AgentTask(代理任务)- 由 Agent + Task 组成
602670
603-
❌ 应该排除的内容:
604-
1. 技术词汇:Controller、Service、Repository、Mapper、DTO、VO、PO、Entity、Request、Response、Config 等
605-
2. 实现细节和数据传输对象:包含 "Request"、"Response"、"Dto"、"Entity" 后缀的条目
606-
3. 技术操作动词:validate、check、convert、deserialize、serialize、encode、decode 等
607-
4. 方法名中的技术操作:如 "checkIfVipAccount" 应只提取 "VIP Account"
608-
5. 通用库 API(如 Spring、OkHttp)和通用类名(如 List、Map)
671+
❌ 绝对不要提取的内容(单个通用词):
672+
- Agent、Chat、Code、Task、Model、Service、Config、Handler、Manager
673+
- File、Path、Node、Item、Event、Action、State、Context、Message
674+
- User、Role、Session、Token、Request、Response、Error、Result
675+
- 任何只有一个单词的通用技术术语
609676
610-
**处理规则:**
611-
1. 如果提取的条目包含技术后缀(如 "CreateCommentDto"),转换为纯业务概念(如 "Comment")
612-
2. 如果方法名包含技术操作(如 "checkIfVipAccount"),提取业务含义("VIP Account")
613-
3. 如果类名包含技术词汇后缀,移除后缀再添加到词典
677+
❌ 也要排除:
678+
1. 技术后缀词:Controller、Service、Repository、Mapper、DTO、Handler 等
679+
2. 通用库 API 和框架类名
614680
615681
## 要分析的名称:
616682
$namesList
@@ -619,17 +685,17 @@ $namesList
619685
```json
620686
{
621687
"entries": [
622-
{"chinese": "博客", "codeTranslation": "Blog", "description": "博客文章"}
688+
{"chinese": "领域词典", "codeTranslation": "DomainDict", "description": "业务术语词典"}
623689
]
624690
}
625691
```
626692
627693
## 输出规则:
628-
1. chinese: 简洁的中文术语(2-6个字
629-
2. codeTranslation: 纯业务概念名(移除技术后缀
630-
3. description: 一句话业务描述(不超过20字)
631-
4. 只输出有意义的业务概念,跳过技术实现细节
632-
5. 如果无法理解或太通用,直接跳过不输出
694+
1. codeTranslation 必须是**复合词**(包含至少2个大写字母开头的单词
695+
2. 不要拆分复合词!保持原样(如 AgentTask 不要拆成 Agent 和 Task
696+
3. 如果输入是单个通用词,直接跳过不输出
697+
4. chinese: 简洁的中文术语(2-6个字)
698+
5. description: 一句话业务描述(不超过20字)
633699
634700
请直接输出JSON,不要其他解释。
635701
""".trimIndent()
@@ -674,14 +740,32 @@ $namesList
674740
val code = match.groupValues[2].trim()
675741
val desc = match.groupValues[3].trim()
676742

677-
if (chinese.isNotBlank() && code.isNotBlank()) {
743+
if (chinese.isNotBlank() && code.isNotBlank() && isValidOutputEntry(code)) {
678744
entries.add(DomainEntry(chinese, code, desc))
679745
}
680746
}
681747

682748
return entries
683749
}
684750

751+
/**
 * Decide whether a `codeTranslation` value returned by the LLM is kept.
 *
 * An entry is accepted only when both conditions hold:
 * - it contains at least two uppercase characters (treated as a compound name), and
 * - its lowercased form is not one of the rejected technical terms.
 *
 * Because two uppercase characters are required first, the rejected-term list
 * only affects inputs such as `TEST` or `UTILS`.
 *
 * @param code the code-side term of a candidate dictionary entry
 * @return `true` when the entry should be added to the dictionary
 */
private fun isValidOutputEntry(code: String): Boolean {
    val rejectedTerms = setOf(
        "impl", "util", "utils", "helper", "helpers",
        "test", "tests", "spec", "mock", "stub", "fake"
    )
    val upperCaseLetters = code.filter { ch -> ch.isUpperCase() }.length

    return upperCaseLetters >= 2 && code.lowercase() !in rejectedTerms
}
768+
685769
// ============= Step 3: Save =============
686770

687771
private fun parseExistingTerms(csv: String): Set<String> {

0 commit comments

Comments
 (0)