[SDK]persist input from run results to avoid input missing in run results (#855)

D-W- · web-flow · commit ecf9d07d70da · 2023-10-24T11:43:55.000+08:00
# Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. Before: ![image](https://github.com/microsoft/promptflow/assets/7776147/1d9aa40f-3bc9-487c-9249-34558aba39c2) After: ![image](https://github.com/microsoft/promptflow/assets/7776147/4628d473-0912-41dd-a442-09c39742a2b4) # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes.
diff --git a/src/promptflow/promptflow/_sdk/operations/_local_storage_operations.py b/src/promptflow/promptflow/_sdk/operations/_local_storage_operations.py
@@ -36,6 +36,7 @@
 from promptflow.contracts.run_info import RunInfo as NodeRunInfo
 from promptflow.contracts.run_info import Status
 from promptflow.contracts.run_mode import RunMode
+from promptflow.executor._result import LineResult
 from promptflow.executor.flow_executor import BulkResult
 from promptflow.storage import AbstractRunStorage
 
@@ -239,7 +240,14 @@ def load_io_spec(self) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, s
             flow_dag = yaml.safe_load(f)
         return flow_dag["inputs"], flow_dag["outputs"]
 
-    def dump_inputs(self, inputs: RunInputs) -> None:
+    def dump_inputs(self, line_results: List[LineResult]) -> None:
+        inputs = []
+        for line_result in line_results:
+            try:
+                inputs.append(line_result.run_info.inputs)
+            except Exception:
+                # ignore when single line doesn't have inputs
+                pass
         df = pd.DataFrame(inputs)
         with open(self._inputs_path, mode="w", encoding=DEFAULT_ENCODING) as f:
             # policy: http://policheck.azurewebsites.net/Pages/TermInfo.aspx?LCID=9&TermID=203588
@@ -389,6 +397,7 @@ def persist_result(self, result: Optional[BulkResult]) -> None:
             return
         self.dump_outputs(result.outputs)
         self.dump_metrics(result.metrics)
+        self.dump_inputs(result.line_results)
 
     @staticmethod
     def _prepare_folder(path: Union[str, Path]) -> Path:
diff --git a/src/promptflow/promptflow/_sdk/operations/_run_submitter.py b/src/promptflow/promptflow/_sdk/operations/_run_submitter.py
@@ -330,8 +330,7 @@ def _submit_bulk_run(self, flow: Flow, run: Run, local_storage: LocalStorageOper
             # persist snapshot and result
             # snapshot: flow directory and (mapped) inputs
             local_storage.dump_snapshot(flow)
-            local_storage.dump_inputs(mapped_inputs)
-            # result: outputs and metrics
+            # persist inputs, outputs and metrics
             local_storage.persist_result(bulk_result)
             # exceptions
             local_storage.dump_exception(exception=exception, bulk_results=bulk_result)
diff --git a/src/promptflow/tests/sdk_cli_test/e2etests/test_flow_run.py b/src/promptflow/tests/sdk_cli_test/e2etests/test_flow_run.py
@@ -745,3 +745,33 @@ def test_system_metrics_in_properties(self, pf) -> None:
         assert FlowRunProperties.SYSTEM_METRICS in run.properties
         assert isinstance(run.properties[FlowRunProperties.SYSTEM_METRICS], dict)
         assert "total_tokens" in run.properties[FlowRunProperties.SYSTEM_METRICS]
+
+    def test_run_get_inputs(self, pf):
+        # inputs should be persisted when defaults are used
+        run = pf.run(
+            flow=f"{FLOWS_DIR}/default_input",
+            data=f"{DATAS_DIR}/webClassification1.jsonl",
+        )
+        inputs = pf.runs._get_inputs(run=run)
+        assert inputs == {"line_number": [0], "question": ["input value from default"]}
+
+        # inputs should be persisted when data value are used
+        run = pf.run(
+            flow=f"{FLOWS_DIR}/flow_with_dict_input",
+            data=f"{DATAS_DIR}/dictInput1.jsonl",
+        )
+        inputs = pf.runs._get_inputs(run=run)
+        assert inputs == {"key": [{"key": "value in data"}], "line_number": [0]}
+
+        # inputs should be persisted when column-mapping are used
+        run = pf.run(
+            flow=f"{FLOWS_DIR}/flow_with_dict_input",
+            data=f"{DATAS_DIR}/webClassification1.jsonl",
+            column_mapping={"key": {"value": "value in column-mapping"}, "url": "${data.url}"},
+        )
+        inputs = pf.runs._get_inputs(run=run)
+        assert inputs == {
+            "key": [{"value": "value in column-mapping"}],
+            "line_number": [0],
+            "url": ["https://www.youtube.com/watch?v=o5ZQyXaAv1g"],
+        }
diff --git a/src/promptflow/tests/test_configs/datas/dictInput1.jsonl b/src/promptflow/tests/test_configs/datas/dictInput1.jsonl
@@ -0,0 +1 @@
+{"key": {"key": "value in data"}}