Fixes

microsoft · Apr 23, 2024 · 192ad93 · 192ad93
1 parent 4e1bfe4
commit 192ad93
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 20 deletions.
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 import inspect
 import os
-import tempfile
+import shutil
 import uuid
 
 from types import FunctionType
@@ -102,25 +102,35 @@ def _apply_target_to_data(target: Callable, data: str, pf_client: PFClient) -> s
     :return: The path to data file with answers from target function.
     :rtype: str
     """
-    with tempfile.TemporaryDirectory() as d:
-        save_function_as_flow(fun=target, target_dir=d, pf=pf_client)
-        run = pf_client.run(
-            flow=d,
-            data=data,
-            name=f'preprocess_{uuid.uuid1()}'
-        )
-        run = pf_client.stream(run)
-        function_output = pd.read_json(pf_client.runs._get_outputs_path(run),
-                                       orient='records', lines=True)
+    # We create the temporary directory for the flow manually, because
+    # the way tempfile removes temporary directories will hang the
+    # debugger: promptflow keeps the flow directory.
+    saved_flow = f'flow_{uuid.uuid1()}'
+    os.makedirs(saved_flow)
+    save_function_as_flow(fun=target, target_dir=saved_flow, pf=pf_client)
+    run = pf_client.run(
+        flow=saved_flow,
+        data=data,
+        name=f'preprocess_{uuid.uuid1()}'
+    )
+    run = pf_client.stream(run)
+    # Delete temporary directory if we can.
+    try:
+        shutil.rmtree(saved_flow)
+    except BaseException:
+        # An exception here means we are running in a debugger. In this case we can
+        # keep the directory.
+        pass
+    function_output = pd.read_json(pf_client.runs._get_outputs_path(run),
+                                   orient='records', lines=True)
     function_output.set_index(LINE_NUMBER, inplace=True)
     function_output.sort_index(inplace=True)
     data_input = pd.read_json(data, orient='records', lines=True)
     data_input = pd.concat([data_input, function_output], axis=1, verify_integrity=True)
     del function_output
-    data_obj = tempfile.TemporaryFile(suffix='.jsonl', mode='w', delete=False)
-    data_obj.close()
-    data_input.to_json(data_obj.name, orient='records', lines=True, index=False)
-    return data_obj.name
+    new_data_name = f'{uuid.uuid1()}.jsonl'
+    data_input.to_json(new_data_name, orient='records', lines=True, index=False)
+    return new_data_name
 
 
 def evaluate(

diff --git a/src/promptflow-evals/tests/evals/e2etests/data/questions.jsonl b/src/promptflow-evals/tests/evals/e2etests/data/questions.jsonl
@@ -1,3 +1,3 @@
-{"question":"How long is flight from Earth to LV-426?"}
-{"question":"Why there is no central heating on the street?"}
-{"question":"Why these questions are so strange?"}
+{"question":"How long is flight from Earth to LV-426?","ground_truth":"Far away."}
+{"question":"Why there is no central heating on the street?","ground_truth":"It is expensive."}
+{"question":"Why these questions are so strange?","ground_truth":"The life is strange..."}
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -99,12 +99,15 @@ def test_evaluate_python_function(self, data_file):
 
     def test_evaluate_with_target(self, questions_file):
         """Test evaluation with target function."""
+        f1_score_eval = F1ScoreEvaluator()
         # run the evaluation with targets
         result = evaluate(
             data=questions_file,
             target=_target_fn,
-            evaluators={"answer": answer_evaluator},
+            evaluators={"answer": answer_evaluator, 'f1': f1_score_eval},
         )
         row_result_df = pd.DataFrame(result["rows"])
-        print(row_result_df)
+        assert "outputs.answer.length" in row_result_df.columns
         assert list(row_result_df["outputs.answer.length"]) == [28, 76, 22]
+        assert "outputs.f1.f1_score" in row_result_df.columns
+        assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"])