Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import com.alibaba.assistant.agent.evaluation.model.EvaluationCriterion;
import com.alibaba.assistant.agent.evaluation.model.EvaluationSuite;
import com.alibaba.assistant.agent.evaluation.model.ExecutionContextFactory;
import com.alibaba.assistant.agent.evaluation.util.EvaluationLogContextHelper;
import com.alibaba.cloud.ai.graph.OverAllState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -92,7 +93,8 @@ public Map<String, Object> apply(OverAllState state) {
throw new IllegalStateException("Required components not found in OverAllState");
}

logger.info("Executing criterion: {}", criterion.getName());
String sessionId = EvaluationLogContextHelper.getSessionId(evaluationContext);
logger.info("Executing criterion: {}, sessionId={}", criterion.getName(), sessionId);

// Build dependency results map from individual state keys
Map<String, CriterionResult> dependencyResults = buildDependencyResults(state);
Expand All @@ -103,10 +105,10 @@ public Map<String, Object> apply(OverAllState state) {
CriterionResult skipResult = checkConditionalExecution(conditionalConfig, dependencyResults);
if (skipResult != null) {
// Condition not met, return skipped result
logger.info("Criterion '{}' skipped: {}", criterion.getName(), conditionalConfig.getSkipReason());
logger.info("Criterion '{}' skipped, sessionId={}: {}", criterion.getName(), sessionId, conditionalConfig.getSkipReason());
return buildSkippedResultUpdates(skipResult);
}
logger.debug("Conditional execution check passed for criterion: {}", criterion.getName());
logger.debug("Conditional execution check passed for criterion: {}, sessionId={}", criterion.getName(), sessionId);
}

// Determine timeout: criterion-specific > suite default
Expand Down Expand Up @@ -139,18 +141,18 @@ public Map<String, Object> apply(OverAllState state) {
} else {
// No executor or timeout disabled, execute directly
if (batchingEnabled) {
logger.debug("Batching enabled for criterion: {}", criterion.getName());
logger.debug("Batching enabled for criterion: {}, sessionId={}", criterion.getName(), sessionId);
result = executeWithBatching(evaluationContext, dependencyResults, batchingConfig);
} else {
logger.debug("Batching not enabled for criterion: {}", criterion.getName());
logger.debug("Batching not enabled for criterion: {}, sessionId={}", criterion.getName(), sessionId);
result = executeWithoutBatching(evaluationContext, dependencyResults);
}
}
} catch (TimeoutException te) {
// Criterion timed out - return timeout result with default value
long elapsedMs = System.currentTimeMillis() - startTime;
logger.warn("Criterion '{}' timed out after {}ms (timeout={}ms), using default value: {}",
criterion.getName(), elapsedMs, timeoutMs, criterion.getDefaultValue());
logger.warn("Criterion '{}' timed out after {}ms (timeout={}ms), sessionId={}, using default value: {}",
criterion.getName(), elapsedMs, timeoutMs, sessionId, criterion.getDefaultValue());

result = buildTimeoutResult(startTime, timeoutMs);
}
Expand All @@ -171,11 +173,11 @@ public Map<String, Object> apply(OverAllState state) {
com.alibaba.assistant.agent.evaluation.observation.EvaluationObservationLifecycleListener
.registerCriterionResult(criterion.getName(), result);

logger.info("Criterion {} completed with status: {}, value: {}",
criterion.getName(), result.getStatus(), result.getValue());
logger.info("Criterion {} completed, sessionId={}, status={}, result={}",
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The log message labels the last field as result={}, but the code passes result.getValue(). This is confusing when debugging; either log the full CriterionResult (if safe) or rename the label to value={}.

Suggested change
logger.info("Criterion {} completed, sessionId={}, status={}, result={}",
logger.info("Criterion {} completed, sessionId={}, status={}, value={}",

Copilot uses AI. Check for mistakes.
criterion.getName(), sessionId, result.getStatus(), result.getValue());

} catch (Exception e) {
logger.error("Error executing criterion {}: {}", criterion.getName(), e.getMessage(), e);
logger.error("Error executing criterion {}, sessionId={}: {}", criterion.getName(), extractSessionIdFromState(state), e.getMessage(), e);

// Create error result with default value if available
CriterionResult errorResult = buildErrorResult(startTime, e.getMessage());
Expand Down Expand Up @@ -259,6 +261,14 @@ private Map<String, CriterionResult> buildDependencyResults(OverAllState state)
return dependencyResults;
}

private String extractSessionIdFromState(OverAllState state) {
Object evaluationContext = state != null ? state.data().get("evaluationContext") : null;
if (evaluationContext instanceof EvaluationContext context) {
return EvaluationLogContextHelper.getSessionId(context);
}
return null;
}

/**
* Execute criterion without batching (original logic).
*/
Expand Down Expand Up @@ -298,23 +308,27 @@ private CriterionResult executeWithBatching(EvaluationContext evaluationContext,

// Check if source is a collection
if (!SourcePathResolver.isCollection(sourceObject)) {
logger.warn("Source path '{}' did not resolve to a collection for criterion '{}', falling back to non-batching execution",
batchingConfig.getSourcePath(), criterion.getName());
logger.warn("Source path '{}' did not resolve to a collection for criterion '{}', sessionId={}, falling back to non-batching execution",
batchingConfig.getSourcePath(), criterion.getName(), EvaluationLogContextHelper.getSessionId(evaluationContext));
return executeWithoutBatching(evaluationContext, dependencyResults);
}

Collection<?> sourceCollection = SourcePathResolver.toCollection(sourceObject);
if (sourceCollection == null || sourceCollection.isEmpty()) {
logger.debug("Source collection is empty for criterion '{}', returning empty result", criterion.getName());
logger.debug("Source collection is empty for criterion '{}', sessionId={}, returning empty result",
criterion.getName(), EvaluationLogContextHelper.getSessionId(evaluationContext));
return createEmptyCollectionResult();
}

logger.info("Criterion '{}': processing {} items with batchSize={}, maxConcurrentBatches={}",
criterion.getName(), sourceCollection.size(), batchingConfig.getBatchSize(), batchingConfig.getMaxConcurrentBatches());
logger.info("Criterion '{}': processing {} items with batchSize={}, maxConcurrentBatches={}, sessionId={}",
criterion.getName(), sourceCollection.size(), batchingConfig.getBatchSize(), batchingConfig.getMaxConcurrentBatches(),
EvaluationLogContextHelper.getSessionId(evaluationContext));

// Split into batches
List<List<Object>> batches = splitIntoBatches(sourceCollection, batchingConfig.getBatchSize());
logger.debug("Split {} items into {} batches", sourceCollection.size(), batches.size());
logger.debug("Split {} items into {} batches, criterion={}, sessionId={}",
sourceCollection.size(), batches.size(), criterion.getName(),
EvaluationLogContextHelper.getSessionId(evaluationContext));

// Process batches
List<CriterionResult> allBatchResults = processBatches(
Expand All @@ -328,7 +342,8 @@ private CriterionResult executeWithBatching(EvaluationContext evaluationContext,
return aggregateResults(evaluationContext, dependencyResults, batchingConfig, allBatchResults);

} catch (Exception e) {
logger.error("Error during batching execution for criterion '{}': {}", criterion.getName(), e.getMessage(), e);
logger.error("Error during batching execution for criterion '{}', sessionId={}: {}",
criterion.getName(), EvaluationLogContextHelper.getSessionId(evaluationContext), e.getMessage(), e);
CriterionResult errorResult = new CriterionResult();
errorResult.setCriterionName(criterion.getName());
errorResult.setStatus(CriterionStatus.ERROR);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package com.alibaba.assistant.agent.evaluation.observation;

import com.alibaba.assistant.agent.evaluation.model.CriterionResult;
import com.alibaba.assistant.agent.evaluation.util.EvaluationLogContextHelper;
import com.alibaba.cloud.ai.graph.GraphLifecycleListener;
import com.alibaba.cloud.ai.graph.RunnableConfig;
import io.opentelemetry.api.trace.Span;
Expand Down Expand Up @@ -164,21 +165,24 @@ public EvaluationObservationLifecycleListener(Tracer tracer, Span parentSpan) {
public void onStart(String nodeId, Map<String, Object> state, RunnableConfig config) {
// 评估 Graph 开始时不需要特殊处理
if (nodeId != null && nodeId.equalsIgnoreCase("__start__")) {
log.debug("EvaluationObservationLifecycleListener#onStart - reason=评估Graph开始");
log.debug("EvaluationObservationLifecycleListener#onStart - reason=评估Graph开始, sessionId={}",
EvaluationLogContextHelper.getSessionId(config));
}
}

@Override
public void onComplete(String nodeId, Map<String, Object> state, RunnableConfig config) {
// 评估 Graph 完成时不需要特殊处理
if (nodeId != null && nodeId.equalsIgnoreCase("__end__")) {
log.debug("EvaluationObservationLifecycleListener#onComplete - reason=评估Graph完成");
log.debug("EvaluationObservationLifecycleListener#onComplete - reason=评估Graph完成, sessionId={}",
EvaluationLogContextHelper.getSessionId(config));
}
}

@Override
public void onError(String nodeId, Map<String, Object> state, Throwable ex, RunnableConfig config) {
log.error("EvaluationObservationLifecycleListener#onError - reason=评估执行出错, nodeId={}", nodeId, ex);
log.error("EvaluationObservationLifecycleListener#onError - reason=评估执行出错, nodeId={}, sessionId={}",
nodeId, EvaluationLogContextHelper.getSessionId(config), ex);

// 停止对应节点的 Span
String nodeKey = getNodeKey(nodeId, config);
Expand Down Expand Up @@ -223,7 +227,8 @@ public void before(String nodeId, Map<String, Object> state, RunnableConfig conf
nodeSpans.put(nodeKey, span);
nodeScopes.put(nodeKey, scope);

log.debug("EvaluationObservationLifecycleListener#before - reason=评估项开始执行, criterionName={}", nodeId);
log.debug("EvaluationObservationLifecycleListener#before - reason=评估项开始执行, criterionName={}, sessionId={}",
nodeId, EvaluationLogContextHelper.getSessionId(config));
}

/**
Expand Down Expand Up @@ -309,10 +314,11 @@ public void after(String nodeId, Map<String, Object> state, RunnableConfig confi
}

log.info("EvaluationObservationLifecycleListener#after - reason=评估项执行完成, " +
"criterionName={}, status={}, durationMs={}",
nodeId, result.getStatus(), durationMs);
"criterionName={}, sessionId={}, status={}, durationMs={}",
nodeId, EvaluationLogContextHelper.getSessionId(config), result.getStatus(), durationMs);
} else {
log.warn("EvaluationObservationLifecycleListener#after - reason=评估结果未找到, criterionName={}", nodeId);
log.warn("EvaluationObservationLifecycleListener#after - reason=评估结果未找到, criterionName={}, sessionId={}",
nodeId, EvaluationLogContextHelper.getSessionId(config));
}

span.end();
Expand Down Expand Up @@ -360,4 +366,3 @@ private String truncate(String str, int maxLength) {
return str.substring(0, maxLength) + "...[truncated]";
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.alibaba.assistant.agent.evaluation.util;

import com.alibaba.assistant.agent.evaluation.model.EvaluationContext;
import com.alibaba.cloud.ai.graph.RunnableConfig;

/**
* 评估链路日志上下文提取工具。
*/
public final class EvaluationLogContextHelper {

private EvaluationLogContextHelper() {
}

public static String getSessionId(EvaluationContext context) {
if (context == null) {
return null;
}
Object sessionId = context.getEnvironmentValue("sessionId");
if (sessionId == null) {
sessionId = context.getEnvironmentValue("threadId");
}
return sessionId != null ? String.valueOf(sessionId) : null;
}

public static String getSessionId(RunnableConfig config) {
if (config == null) {
return null;
}
return config.threadId().orElse(null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.alibaba.assistant.agent.evaluation.model.CriterionExecutionContext;
import com.alibaba.assistant.agent.evaluation.model.CriterionResult;
import com.alibaba.assistant.agent.evaluation.model.CriterionStatus;
import com.alibaba.assistant.agent.evaluation.util.EvaluationLogContextHelper;
import com.alibaba.assistant.agent.extension.experience.model.Experience;
import com.alibaba.assistant.agent.extension.experience.model.ExperienceQuery;
import com.alibaba.assistant.agent.extension.experience.model.ExperienceQueryContext;
Expand Down Expand Up @@ -114,23 +115,29 @@ private static RuleBasedEvaluator createEvaluator(
try {
// 优先使用 enhanced_user_input,否则使用原始 userInput
String queryText = extractEnhancedOrOriginalInput(ctx);
String sessionId = EvaluationLogContextHelper.getSessionId(ctx.getInputContext());
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=开始检索经验, sessionId={}, queryText={}, experienceTypes={}",
evaluatorId, sessionId, queryText, experienceTypes);
Comment on lines +119 to +120
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

queryText appears to be derived from end-user input, but it’s currently logged at INFO level. This can leak PII/secrets into logs. Consider removing it, logging only its length / a truncated value, or downgrading to DEBUG with truncation/redaction.

Suggested change
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=开始检索经验, sessionId={}, queryText={}, experienceTypes={}",
evaluatorId, sessionId, queryText, experienceTypes);
int queryTextLength = StringUtils.hasLength(queryText) ? queryText.length() : 0;
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=开始检索经验, sessionId={}, queryTextLength={}, experienceTypes={}",
evaluatorId, sessionId, queryTextLength, experienceTypes);

Copilot uses AI. Check for mistakes.

List<Experience> experiences = queryExperiencesByStringIntersection(
experienceProvider,
ctx,
queryText,
experienceTypes,
maxExperiencesPerType
);

if (experiences.isEmpty()) {
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=未检索到经验", evaluatorId);
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=未检索到经验, sessionId={}", evaluatorId, sessionId);
result.setStatus(CriterionStatus.SUCCESS);
result.setValue("");
result.getMetadata().put("is_empty", true);
return result;
}

log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=检索到经验, count={}", evaluatorId, experiences.size());
log.info("ExperienceRetrievalEvaluatorFactory#{} - reason=检索到经验, sessionId={}, count={}, experienceIds={}",
evaluatorId, sessionId, experiences.size(),
experiences.stream().map(Experience::getId).collect(Collectors.toList()));

// 构建 ref_entries,每个经验作为一个独立的条目,experience ID 作为 ref-id
List<Map<String, String>> refEntries = buildRefEntries(experiences);
Expand All @@ -142,7 +149,8 @@ private static RuleBasedEvaluator createEvaluator(
result.getMetadata().put("experience_count", experiences.size());
Comment on lines 138 to 149
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

experiences.stream().map(...).collect(toList()) is computed multiple times (for logging and for metadata). Consider computing experienceIds once and reusing it to avoid duplicate work and ensure the log/metadata always refer to the same list.

Copilot uses AI. Check for mistakes.

} catch (Exception e) {
log.error("ExperienceRetrievalEvaluatorFactory#{} - reason=经验检索失败", evaluatorId, e);
log.error("ExperienceRetrievalEvaluatorFactory#{} - reason=经验检索失败, sessionId={}",
evaluatorId, EvaluationLogContextHelper.getSessionId(ctx.getInputContext()), e);
result.setStatus(CriterionStatus.ERROR);
result.setErrorMessage(e.getMessage());
}
Expand Down Expand Up @@ -181,6 +189,7 @@ private static String extractEnhancedOrOriginalInput(CriterionExecutionContext c
*/
private static List<Experience> queryExperiencesByStringIntersection(
ExperienceProvider experienceProvider,
CriterionExecutionContext executionContext,
String queryText,
List<ExperienceType> types,
int maxExperiencesPerType) {
Expand All @@ -191,6 +200,20 @@ private static List<Experience> queryExperiencesByStringIntersection(

ExperienceQueryContext queryContext = new ExperienceQueryContext();
queryContext.setUserQuery(queryText);
if (executionContext != null && executionContext.getInputContext() != null) {
Object sessionId = executionContext.getInputContext().getEnvironmentValue("sessionId");
if (sessionId != null) {
queryContext.setSessionId(String.valueOf(sessionId));
Comment on lines +204 to +206
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Session ID propagation into ExperienceQueryContext only checks environment.sessionId, but EvaluationLogContextHelper also falls back to environment.threadId. This can cause logs to show a non-null sessionId while downstream experience queries receive a null sessionId. Consider using EvaluationLogContextHelper.getSessionId(executionContext.getInputContext()) (or duplicating the same fallback logic) when setting queryContext.sessionId.

Suggested change
Object sessionId = executionContext.getInputContext().getEnvironmentValue("sessionId");
if (sessionId != null) {
queryContext.setSessionId(String.valueOf(sessionId));
String sessionId = EvaluationLogContextHelper.getSessionId(executionContext.getInputContext());
if (StringUtils.hasText(sessionId)) {
queryContext.setSessionId(sessionId);

Copilot uses AI. Check for mistakes.
}
Object tenantId = executionContext.getInputContext().getEnvironmentValue("tenantId");
if (tenantId != null) {
queryContext.setTenantId(String.valueOf(tenantId));
}
Object userId = executionContext.getInputContext().getEnvironmentValue("userId");
if (userId != null) {
queryContext.setUserId(String.valueOf(userId));
}
}
Comment on lines 201 to +216
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New behavior propagates sessionId/tenantId/userId from CriterionExecutionContext into ExperienceQueryContext, but there’s no test asserting that these fields are passed through to ExperienceProvider.query(...). Since this module already has unit tests around ExperienceQueryContext, consider adding a focused test that mocks ExperienceProvider and verifies the constructed ExperienceQueryContext contains the expected IDs.

Copilot uses AI. Check for mistakes.

List<Experience> allExperiences = new ArrayList<>();

Expand Down Expand Up @@ -318,4 +341,3 @@ private static String formatExperiences(List<Experience> experiences, String pha
return sb.toString();
}
}

Loading
Loading