diff --git a/src/promptflow-core/promptflow/_utils/logger_utils.py b/src/promptflow-core/promptflow/_utils/logger_utils.py index fca4c9c0a7e..d8d755ae9f3 100644 --- a/src/promptflow-core/promptflow/_utils/logger_utils.py +++ b/src/promptflow-core/promptflow/_utils/logger_utils.py @@ -207,6 +207,19 @@ def get_logger(name: str) -> logging.Logger: service_logger = get_logger("execution.service") +def update_logger_levels(log_level: Optional[str] = None) -> None: + """ + Update the logger levels. + + :param log_level: The new logging level. If it is None, + the logging level returned by + get_pf_logging_level() is used. + :type log_level: Optional[str] + """ + for log in [flow_logger, bulk_logger, logger, service_logger]: + log.setLevel(log_level or get_pf_logging_level()) + + logger_contexts = [] diff --git a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py index d3241495733..cf3b5c2b878 100644 --- a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py +++ b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py @@ -8,7 +8,7 @@ from promptflow._utils.dataclass_serializer import convert_eager_flow_output_to_dict from promptflow._utils.flow_utils import dump_flow_result, is_executable_chat_flow -from promptflow._utils.logger_utils import LoggerFactory +from promptflow._utils.logger_utils import LoggerFactory, get_pf_logging_level from promptflow._utils.multimedia_utils import MultimediaProcessor from promptflow.core._connection import _Connection from promptflow.core._connection_provider._connection_provider import ConnectionProvider @@ -59,7 +59,10 @@ def __init__( init_kwargs: dict = None, **kwargs, ): - self.logger = kwargs.get("logger", LoggerFactory.get_logger("flowinvoker")) + self.logger = kwargs.get( + "logger", + LoggerFactory.get_logger("flowinvoker", + verbosity=kwargs.get('log_level') or get_pf_logging_level())) self._init_kwargs = init_kwargs or {} self.logger.debug(f"Init flow invoker with init kwargs: {self._init_kwargs}") # TODO: avoid to use private attribute after we finalize the inheritance @@ -123,7 +126,8 @@ def _init_connections(self, connection_provider): connection_names=self.flow.get_connection_names( environment_variables_overrides=os.environ, ), - provider=ConnectionProvider.init_from_provider_config(connection_provider, credential=self._credential), + provider=ConnectionProvider.init_from_provider_config( + connection_provider, credential=self._credential), connections_to_ignore=connections_to_ignore, # fetch connections with name override connections_to_add=list(self.connections_name_overrides.values()), diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py index 59e347fbb6a..7c0c4be3f66 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py @@ -6,7 +6,7 @@ from functools import lru_cache from os import PathLike from pathlib import Path -from typing import Dict, Union +from typing import Dict, Optional, Union from promptflow._sdk._configuration import Configuration from promptflow._sdk._constants import NODES @@ -37,19 +37,20 @@ def __init__(self, flow_path: PathLike): @classmethod @lru_cache - def resolve(cls, flow: Flow) -> "FlowInvoker": + def resolve(cls, flow: Flow, log_level: Optional[int] = None) -> "FlowInvoker": """Resolve flow to 
flow invoker.""" resolver = cls(flow_path=flow.path) resolver._resolve(flow_context=flow.context) - return resolver._create_invoker(flow_context=flow.context) + return resolver._create_invoker(flow_context=flow.context, log_level=log_level) @classmethod @lru_cache - def resolve_async_invoker(cls, flow: Flow) -> "AsyncFlowInvoker": + def resolve_async_invoker(cls, flow: Flow, log_level: Optional[int] = None) -> "AsyncFlowInvoker": """Resolve flow to flow invoker.""" resolver = cls(flow_path=flow.path) resolver._resolve(flow_context=flow.context) - return resolver._create_invoker(flow_context=flow.context, is_async_call=True) + return resolver._create_invoker(flow_context=flow.context, is_async_call=True, + log_level=log_level) def _resolve(self, flow_context: FlowContext): """Resolve flow context.""" @@ -113,7 +114,8 @@ def _resolve_connection_objs(self, flow_context: FlowContext): return connections def _create_invoker( - self, flow_context: FlowContext, is_async_call=False + self, flow_context: FlowContext, is_async_call=False, + log_level: Optional[int] = None ) -> Union["FlowInvoker", "AsyncFlowInvoker"]: from promptflow.core._serving.flow_invoker import AsyncFlowInvoker, FlowInvoker @@ -132,6 +134,7 @@ def _create_invoker( flow=resolved_flow, connections=connections, streaming=flow_context.streaming, + log_level=log_level, ) else: return FlowInvoker( @@ -139,4 +142,5 @@ def _create_invoker( connections=connections, streaming=flow_context.streaming, connection_provider=Configuration.get_instance().get_connection_provider(), + log_level=log_level, ) diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py index 900b4ea7497..13214cfc3c7 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py @@ -9,6 +9,7 @@ from promptflow._constants import DEFAULT_ENCODING, FLOW_FILE_SUFFIX from promptflow._sdk.entities._validation import SchemaValidatableMixin from promptflow._utils.flow_utils import is_flex_flow, is_prompty_flow, resolve_flow_path +from promptflow._utils.logger_utils import update_logger_levels from promptflow._utils.yaml_utils import load_yaml_string from promptflow.core._flow import AbstractFlowBase from promptflow.exceptions import UserErrorException @@ -145,6 +146,7 @@ def __init__( **kwargs, ): self.variant = kwargs.pop("variant", None) or {} + self._log_level = kwargs.pop("log_level", None) super().__init__(data=dag, code=code, path=path, **kwargs) @property @@ -236,6 +238,8 @@ def __call__(self, *args, **kwargs): if args: raise UserErrorException("Flow can only be called with keyword arguments.") + if self._log_level: + update_logger_levels(self._log_level) result = self.invoke(inputs=kwargs) return result.output @@ -243,7 +247,7 @@ def invoke(self, inputs: dict) -> "LineResult": """Invoke a flow and get a LineResult object.""" from promptflow._sdk.entities._flows._flow_context_resolver import FlowContextResolver - invoker = FlowContextResolver.resolve(flow=self) + invoker = FlowContextResolver.resolve(flow=self, log_level=self._log_level) result = invoker._invoke( data=inputs, ) diff --git a/src/promptflow-devkit/tests/sdk_cli_test/conftest.py b/src/promptflow-devkit/tests/sdk_cli_test/conftest.py index cab32c4a106..57a81f3f90c 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/conftest.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/conftest.py @@ -8,7 +8,7 @@ import pytest from _constants import 
CONNECTION_FILE, PROMPTFLOW_ROOT from fastapi.testclient import TestClient -from mock import mock +from unittest import mock from pytest_mock import MockerFixture from sqlalchemy import create_engine diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py index ac3d53f1dd8..548028d7f98 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import inspect from types import FunctionType -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Optional, Union, cast import pandas as pd @@ -103,23 +103,27 @@ def evaluate( code_client = CodeClient() evaluator_info = {} - - for evaluator_name, evaluator in evaluators.items(): - if isinstance(evaluator, FunctionType): - evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}}) - else: - evaluator_info.update({evaluator_name: {"client": code_client, "evaluator": evaluator}}) - - evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run( - flow=evaluator, - column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)), - data=data, - stream=True, - ) + if evaluator_config is None: + evaluator_config = {} + + if evaluators: + for evaluator_name, evaluator in evaluators.items(): + if isinstance(evaluator, FunctionType): + evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}}) + else: + evaluator_info.update({evaluator_name: {"client": code_client, "evaluator": evaluator}}) + + evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run( + flow=evaluator, + column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)), + data=data, + stream=True, + ) evaluators_result_df = None for evaluator_name, evaluator_info in evaluator_info.items(): - evaluator_result_df = evaluator_info["client"].get_details(evaluator_info["run"], all_results=True) + evaluator_result_df = cast( + Union[PFClient, CodeClient], evaluator_info["client"]).get_details(evaluator_info["run"], all_results=True) # drop input columns evaluator_result_df = evaluator_result_df.drop( diff --git a/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py index e2fc2b8066a..0dcfde1b6e5 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py @@ -7,7 +7,7 @@ import json import logging from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Dict, List +from typing import Any, Dict, List, Optional import numpy as np @@ -18,7 +18,8 @@ class ChatEvaluator: def __init__( - self, model_config, eval_last_turn: bool = False, parallel: bool = True + self, model_config, eval_last_turn: bool = False, parallel: bool = True, + log_level: Optional[int] = None ): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -55,12 +56,12 @@ def __init__( # TODO: Need a built-in evaluator for retrieval. 
It needs to be added to `self._rag_evaluators` collection self._rag_evaluators = [ - GroundednessEvaluator(model_config), - RelevanceEvaluator(model_config), + GroundednessEvaluator(model_config, log_level=log_level), + RelevanceEvaluator(model_config, log_level=log_level), ] self._non_rag_evaluators = [ - CoherenceEvaluator(model_config), - FluencyEvaluator(model_config), + CoherenceEvaluator(model_config, log_level=log_level), + FluencyEvaluator(model_config, log_level=log_level), ] def __call__(self, *, conversation, **kwargs): @@ -167,8 +168,8 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator): return {} def _aggregate_results(self, per_turn_results: List[Dict]): - scores = {} - reasons = {} + scores: Dict[str, Any] = {} + reasons: Dict[str, Any] = {} for turn in per_turn_results: for metric, value in turn.items(): @@ -181,7 +182,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]): scores[metric] = [] scores[metric].append(value) - aggregated = {} + aggregated: Dict[str, Any] = {} evaluation_per_turn = {} for metric, values in scores.items(): diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py index 023a52845d8..7df36b0e105 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -11,12 +13,15 @@ class CoherenceEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. :param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. 
+ :type log_level: Optional[int] **Usage** @@ -31,7 +36,7 @@ def __init__(self, model_config): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py index 0b93be5ce38..e2433e610b0 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py @@ -1,7 +1,7 @@ from promptflow.core import tool import time from constants import RAIService, Tasks -from typing import List +from typing import Any, Dict, List import numpy as np import re from utils import get_harm_severity_level @@ -68,7 +68,7 @@ def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredentia time.sleep(sleep_time) -def parse_response(batch_response: List[dict], metric_name: str) -> List[List[dict]]: +def parse_response(batch_response: List[dict], metric_name: str) -> Dict[str, Any]: # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated key = metric_name diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py index f9a035158bf..ff4fb69df44 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py @@ -1,8 +1,10 @@ import constants import numpy as np +from typing import Union -def get_harm_severity_level(harm_score: int) -> str: + +def get_harm_severity_level(harm_score: int) -> Union[str, float]: HARM_SEVERITY_LEVEL_MAPPING = {constants.HarmSeverityLevel.Safe: [0, 1], constants.HarmSeverityLevel.Low: [2, 3], constants.HarmSeverityLevel.Medium: [4, 5], diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py index 323df141d2e..897922f36e8 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py @@ -1,10 +1,13 @@ +from typing import Optional + from promptflow.client import load_flow from pathlib import Path from promptflow.evals._constants import EvaluationMetrics class HateUnfairnessEvaluator: - def __init__(self, project_scope: dict, credential=None): + def 
__init__(self, project_scope: dict, credential=None, + log_level: Optional[int] = None): """ Initialize an evaluator for hate unfairness score. @@ -13,6 +16,8 @@ def __init__(self, project_scope: dict, credential=None): :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -28,7 +33,7 @@ def __init__(self, project_scope: dict, credential=None): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates hate unfairness content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py index b2ff8554bfd..762ea5da423 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py @@ -1,10 +1,13 @@ +from typing import Optional + from promptflow.client import load_flow from pathlib import Path from promptflow.evals._constants import EvaluationMetrics class SelfHarmEvaluator: - def __init__(self, project_scope: dict, credential=None): + def __init__(self, project_scope: dict, credential=None, + log_level: Optional[int] = None): """ Initialize an evaluator for self harm score. @@ -13,6 +16,8 @@ def __init__(self, project_scope: dict, credential=None): :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -28,7 +33,7 @@ def __init__(self, project_scope: dict, credential=None): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates self harm content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py index b1247369cdc..cdcf80f0d56 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py @@ -1,10 +1,13 @@ +from typing import Optional + from promptflow.client import load_flow from pathlib import Path from promptflow.evals._constants import EvaluationMetrics class SexualEvaluator: - def __init__(self, project_scope: dict, credential=None): + def __init__(self, project_scope: dict, credential=None, + log_level: Optional[int] = None): """ Initialize an evaluator for sexual score. @@ -13,6 +16,8 @@ def __init__(self, project_scope: dict, credential=None): :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. 
+ :type log_level: Optional[int] **Usage** @@ -28,7 +33,7 @@ def __init__(self, project_scope: dict, credential=None): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates sexual content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py index 29bc631c866..8e1fb37daef 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py @@ -1,10 +1,13 @@ +from typing import Optional + from promptflow.client import load_flow from pathlib import Path from promptflow.evals._constants import EvaluationMetrics class ViolenceEvaluator: - def __init__(self, project_scope: dict, credential=None): + def __init__(self, project_scope: dict, credential=None, + log_level: Optional[int] = None): """ Initialize an evaluator for violence score. @@ -13,6 +16,8 @@ def __init__(self, project_scope: dict, credential=None): :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -28,7 +33,7 @@ def __init__(self, project_scope: dict, credential=None): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates violence content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py index 2372e98cc72..d2baaf281c9 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py @@ -4,15 +4,20 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from pathlib import Path class F1ScoreEvaluator: - def __init__(self): + def __init__(self, log_level: Optional[int] = None) -> None: """ Initialize an evaluator for calculating F1 score. + :param log_level: The logging level. + :type log_level: Optional[int] + **Usage** .. code-block:: python @@ -27,7 +32,7 @@ def __init__(self): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, answer: str, ground_truth: str, **kwargs): """Evaluate F1 score. 
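A minimal usage sketch of the new log_level argument, assuming the promptflow-evals package with this change applied (the inputs and the printed result are illustrative only, not taken from the patch):

    import logging

    from promptflow.evals.evaluators.f1_score import F1ScoreEvaluator

    # log_level is forwarded to load_flow(), stored on the Flow entity, and handed to
    # the FlowInvoker when the evaluator is called.
    evaluator = F1ScoreEvaluator(log_level=logging.DEBUG)
    result = evaluator(
        answer="Paris is the capital of France.",
        ground_truth="The capital of France is Paris.",
    )
    print(result)
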
diff --git a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py index 08c6ad25677..5eb4ee5e87e 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py @@ -3,7 +3,7 @@ @tool -def compute_f1_score(answer: str, ground_truth: str) -> str: +def compute_f1_score(answer: str, ground_truth: str) -> float: import string import re diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py index 4d8fc742c03..2bfd31a8361 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -11,12 +13,15 @@ class FluencyEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. :param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -31,7 +36,7 @@ def __init__(self, model_config): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py index 5023ee640cc..db59fb73007 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -11,12 +13,15 @@ class GroundednessEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. :param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. 
+ :type log_level: Optional[int] **Usage** @@ -32,7 +37,7 @@ def __init__(self, model_config): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py index 09955b6da95..687549ed62d 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.evals.evaluators import ( CoherenceEvaluator, F1ScoreEvaluator, @@ -15,7 +17,8 @@ class QAEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -23,6 +26,8 @@ def __init__(self, model_config): :type model_config: AzureOpenAIModelConfiguration :return: A function that evaluates and generates metrics for "question-answering" scenario. :rtype: function + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -37,12 +42,12 @@ def __init__(self, model_config): ) """ self._evaluators = [ - GroundednessEvaluator(model_config), - RelevanceEvaluator(model_config), - CoherenceEvaluator(model_config), - FluencyEvaluator(model_config), - SimilarityEvaluator(model_config), - F1ScoreEvaluator(), + GroundednessEvaluator(model_config, log_level=log_level), + RelevanceEvaluator(model_config, log_level=log_level), + CoherenceEvaluator(model_config, log_level=log_level), + FluencyEvaluator(model_config, log_level=log_level), + SimilarityEvaluator(model_config, log_level=log_level), + F1ScoreEvaluator(log_level=log_level), ] def __call__(self, *, question: str, answer: str, context: str, ground_truth: str, **kwargs): diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py index 6d1d89ad68a..9578f50fe1a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -11,12 +13,15 @@ class RelevanceEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. 
:param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -33,7 +38,7 @@ def __init__(self, model_config): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py index a36bd032a1f..000f4801ff9 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -11,12 +13,15 @@ class SimilarityEvaluator: - def __init__(self, model_config): + def __init__(self, model_config, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. :param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. 
+ :type log_level: Optional[int] **Usage** @@ -32,7 +37,7 @@ def __init__(self, model_config): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/synthetic/qa.py b/src/promptflow-evals/promptflow/evals/synthetic/qa.py index e56a2150d80..0fb91afa915 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/qa.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/qa.py @@ -31,7 +31,7 @@ if openai_version >= pkg_resources.parse_version("1.0.0"): _RETRY_ERRORS: Tuple = (openai.APIConnectionError, openai.APIError, openai.APIStatusError) else: - _RETRY_ERRORS: Tuple = ( + _RETRY_ERRORS = ( openai.error.ServiceUnavailableError, # pylint: disable=no-member openai.error.APIError, # pylint: disable=no-member openai.error.RateLimitError, # pylint: disable=no-member diff --git a/src/promptflow-evals/pyproject.toml b/src/promptflow-evals/pyproject.toml index d482a648868..c7c0b98e638 100644 --- a/src/promptflow-evals/pyproject.toml +++ b/src/promptflow-evals/pyproject.toml @@ -95,3 +95,12 @@ name = "Contract forbidden modules" type = "forbidden" source_modules = ["promptflow.evals"] forbidden_modules = [] + +[tool.mypy] +exclude = [ + "tests/evals" +] +warn_unused_configs = true +follow_imports = "skip" +ignore_missing_imports = true +follow_imports_for_stubs = false \ No newline at end of file diff --git a/src/promptflow-evals/tests/unittests/test_evaluator_logging.py b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py new file mode 100644 index 00000000000..995278e22d8 --- /dev/null +++ b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py @@ -0,0 +1,37 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +import pytest +import logging + +from unittest.mock import patch + +from promptflow.evals.evaluators.f1_score import F1ScoreEvaluator + + +class TestEvaluatorLogging: + + @pytest.mark.parametrize( + "log_level,expected", + [ + (logging.INFO, set(['flowinvoker'])), + (logging.WARNING, set()), + ]) + def test_f1_score_evaluator_logs(self, caplog, log_level, expected): + """Test logging with f1 score_evaluator.""" + # Note we are not checking for 'execution.flow' as caplog + # cannot catch it as this logger does not have a root logger as a parent. 
+ def mock_get(name: str, verbosity: int = logging.INFO, target_stdout: bool = False): + logger = logging.getLogger(name) + logger.setLevel(verbosity) + return logger + + with patch('promptflow._utils.logger_utils.LoggerFactory') as mock_factory: + mock_factory.get_logger = mock_get + F1ScoreEvaluator(log_level=log_level)( + answer='June is the coldest summer month.', + ground_truth='January is the coldest winter month.' + ) + log_called = {lg.name for lg in caplog.records} + assert {'flowinvoker'}.intersection(log_called) == expected diff --git a/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml b/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml index 8388a9d7a44..e52427ad235 100644 --- a/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml +++ b/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml @@ -4,7 +4,7 @@ inputs: outputs: output: type: string - reference: ${print_val.output.value} + reference: ${print_val.output} nodes: - name: print_val type: python
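For context, a minimal sketch of what update_logger_levels does once the patch above is applied; the logger names come from logger_utils, and the WARNING level is just an example:

    import logging

    from promptflow._utils.logger_utils import bulk_logger, flow_logger, update_logger_levels

    # Flow.__call__ invokes update_logger_levels(self._log_level) when a log level is set;
    # the level is applied to the shared execution loggers, and None falls back to
    # get_pf_logging_level().
    update_logger_levels("WARNING")
    assert flow_logger.level == logging.WARNING
    assert bulk_logger.level == logging.WARNING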