Commit 7df12ee

feat(py): new get_experiment_results() endpoint (#2021)
get_experiment_results() and test case
1 parent 7b9f5a5 commit 7df12ee

File tree

python/langsmith/client.py
python/langsmith/evaluation/_arunner.py
python/langsmith/evaluation/_runner.py
python/langsmith/schemas.py
python/tests/integration_tests/test_client.py
vendor/orjson/test/test_api.py

6 files changed: +235 −7 lines changed

python/langsmith/client.py

Lines changed: 137 additions & 1 deletion
@@ -92,7 +92,7 @@
     serialized_run_operation_to_multipart_parts_and_context,
 )
 from langsmith._internal._serde import dumps_json as _dumps_json
-from langsmith.schemas import AttachmentInfo
+from langsmith.schemas import AttachmentInfo, ExampleWithRuns
 
 
 def _check_otel_enabled() -> bool:
@@ -8269,6 +8269,142 @@ async def helpfulness(outputs: dict) -> dict:
             **kwargs,
         )
 
+    def _paginate_examples_with_runs(
+        self,
+        dataset_id: ID_TYPE,
+        session_id: uuid.UUID,
+        preview: bool = False,
+        comparative_experiment_id: Optional[uuid.UUID] = None,
+        filters: dict[uuid.UUID, list[str]] | None = None,
+        limit: Optional[int] = None,
+    ) -> Iterator[list[ExampleWithRuns]]:
+        """Paginate through examples with runs and yield batches.
+
+        Args:
+            dataset_id: Dataset UUID to fetch examples with runs for.
+            session_id: Session UUID to filter runs by (same as project_id).
+            preview: Whether to return preview data only.
+            comparative_experiment_id: Optional comparative experiment UUID.
+            filters: Optional filters to apply.
+            limit: Maximum total number of results to return.
+
+        Yields:
+            Batches of run results as lists of ExampleWithRuns instances.
+        """
+        offset = 0
+        results_count = 0
+
+        while True:
+            remaining = (limit - results_count) if limit else None
+            batch_limit = min(100, remaining) if remaining else 100
+
+            body = {
+                "session_ids": [session_id],
+                "offset": offset,
+                "limit": batch_limit,
+                "preview": preview,
+                "comparative_experiment_id": comparative_experiment_id,
+                "filters": filters,
+            }
+
+            response = self.request_with_retries(
+                "POST",
+                f"/datasets/{dataset_id}/runs",
+                request_kwargs={"data": _dumps_json(body)},
+            )
+
+            batch = response.json()
+            if not batch:
+                break
+
+            # Transform raw dictionaries into ExampleWithRuns instances
+            examples_batch = [ls_schemas.ExampleWithRuns(**result) for result in batch]
+            yield examples_batch
+            results_count += len(batch)
+
+            if len(batch) < batch_limit or (limit and results_count >= limit):
+                break
+
+            offset += len(batch)
+
+    def get_experiment_results(
+        self,
+        name: Optional[str] = None,
+        project_id: Optional[uuid.UUID] = None,
+        preview: bool = False,
+        comparative_experiment_id: Optional[uuid.UUID] = None,
+        filters: dict[uuid.UUID, list[str]] | None = None,
+        limit: Optional[int] = None,
+    ) -> ls_schemas.ExperimentResults:
+        """Get results for an experiment, including aggregated session stats and the experiment runs for each dataset example.
+
+        Experiment results may not be available immediately after the experiment is created.
+
+        Args:
+            name: The experiment name.
+            project_id: The experiment's tracing project ID (also called
+                session_id); it can be found in the URL of the LangSmith
+                experiment page.
+            preview: Whether to return lightweight preview data only. When True,
+                fetches inputs_preview/outputs_preview summaries instead of full
+                inputs/outputs from S3 storage. Faster and uses less bandwidth.
+            comparative_experiment_id: Optional comparative experiment UUID for
+                pairwise comparison experiment results.
+            filters: Optional filters to apply to the results.
+            limit: Maximum number of results to return.
+
+        Returns:
+            ExperimentResults containing stats (a TracerSessionResult) and an
+            iterator of examples_with_runs (ExampleWithRuns instances).
+
+        Raises:
+            ValueError: If no project is found for the given name or project_id.
+
+        Example:
+            >>> client = Client()
+            >>> results = client.get_experiment_results(
+            ...     project_id="037ae90f-f297-4926-b93c-37d8abf6899f",
+            ... )
+            >>> for example_with_runs in results["examples_with_runs"]:
+            ...     print(example_with_runs.dict())
+
+            >>> # Access aggregated experiment stats
+            >>> print(f"Total runs: {results['stats'].run_count}")
+            >>> print(f"Total cost: {results['stats'].total_cost}")
+            >>> print(f"P50 latency: {results['stats'].latency_p50}")
+
+        """
+        if name and not project_id:
+            projects = list(self.list_projects(name=name))
+            if not projects:
+                raise ValueError(f"No experiment found with name: '{name}'")
+            project_id = projects[0].id
+
+        # Get aggregated stats for the experiment project/session
+        project_stats = list(
+            self.list_projects(
+                project_ids=[cast(uuid.UUID, project_id)], include_stats=True
+            )
+        )
+
+        if not project_stats:
+            raise ValueError(f"No experiment found with project_id: '{project_id}'")
+
+        dataset_id = project_stats[0].reference_dataset_id
+
+        def _get_examples_with_runs_iterator():
+            """Yield examples with their corresponding experiment runs."""
+            for batch in self._paginate_examples_with_runs(
+                dataset_id=dataset_id,
+                session_id=project_id,
+                preview=preview,
+                comparative_experiment_id=comparative_experiment_id,
+                filters=filters,
+                limit=limit,
+            ):
+                yield from batch
+
+        return ls_schemas.ExperimentResults(
+            stats=project_stats[0],
+            examples_with_runs=_get_examples_with_runs_iterator(),
+        )
+
 
 def convert_prompt_to_openai_format(
     messages: Any,
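
For orientation, here is a minimal usage sketch of the new endpoint, distilled from the docstring above; the experiment name is hypothetical, and (as the docstring notes) results may take a moment to become available after an experiment finishes:

from langsmith import Client

client = Client()

# Resolve the experiment by name (internally mapped to its tracing project).
results = client.get_experiment_results(name="my-experiment", limit=10)

# Aggregated stats arrive eagerly as a TracerSessionResult.
print(f"runs: {results['stats'].run_count}")

# Examples stream lazily, fetched in batches of up to 100 per request;
# each item is an ExampleWithRuns carrying its experiment runs.
for example in results["examples_with_runs"]:
    print(example.id, len(example.runs))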

python/langsmith/evaluation/_arunner.py

Lines changed: 0 additions & 2 deletions
@@ -648,7 +648,6 @@ def _reset_example_attachments(self, example: schemas.Example) -> schemas.Example:
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,
@@ -767,7 +766,6 @@ def _get_example_with_readers(self, example: schemas.Example) -> schemas.Example:
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,

python/langsmith/evaluation/_runner.py

Lines changed: 0 additions & 1 deletion
@@ -1391,7 +1391,6 @@ def _reset_example_attachment_readers(
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,

python/langsmith/schemas.py

Lines changed: 16 additions & 1 deletion
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
 from datetime import datetime, timedelta, timezone
 from decimal import Decimal
 from enum import Enum
@@ -158,7 +159,6 @@ class Example(ExampleBase):
     )
     dataset_id: UUID = Field(default=UUID("00000000-0000-0000-0000-000000000000"))
     modified_at: Optional[datetime] = Field(default=None)
-    runs: list[Run] = Field(default_factory=list)
     source_run_id: Optional[UUID] = None
     attachments: Optional[dict[str, AttachmentInfo]] = Field(default=None)
     """Dictionary with attachment names as keys and a tuple of the S3 url
@@ -1261,3 +1261,18 @@ class UpsertExamplesResponse(TypedDict):
     """The number of examples that were upserted."""
     example_ids: list[str]
     """The ids of the examples that were upserted."""
+
+
+class ExampleWithRuns(Example):
+    """Example with runs."""
+
+    runs: list[Run] = Field(default_factory=list)
+    """The runs of the example."""
+
+
+class ExperimentResults(TypedDict):
+    """Results container for experiment data with stats and examples."""
+
+    stats: TracerSessionResult
+    examples_with_runs: Iterator[ExampleWithRuns]
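
A practical consequence of typing examples_with_runs as an Iterator rather than a list is that it is lazy and single-pass: iterating it drives the paginated POST requests shown in client.py above, and it can only be consumed once. A brief sketch of the implication (the experiment name is hypothetical; Run's error field is used for illustration):

results = client.get_experiment_results(name="my-experiment")

# Materialize once if you need multiple passes over the examples.
examples = list(results["examples_with_runs"])
with_errors = [ex for ex in examples if any(r.error for r in ex.runs)]

# The underlying generator is now exhausted; iterating again yields nothing.
assert list(results["examples_with_runs"]) == []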

python/tests/integration_tests/test_client.py

Lines changed: 80 additions & 0 deletions
@@ -3567,3 +3567,83 @@ def export_batch(self, run_ops, otel_context_map):
         readable_span.attributes[_otel_exporter.GENAI_COMPLETION]
         == '{"answer":"Hello, User!"}'
     )
+
+
+def test_get_experiment_results(langchain_client: Client) -> None:
+    """Test get_experiment_results method with evaluation data."""
+    dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
+    dataset = _create_dataset(langchain_client, dataset_name)
+
+    # Create example with attachments
+    example = ExampleCreate(
+        inputs={"question": "What is shown in the image?"},
+        outputs={"answer": "test image"},
+        attachments={
+            "image": ("image/png", b"fake image data for testing"),
+        },
+    )
+
+    langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])
+
+    def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
+        # Verify we receive the attachment data
+        assert "image" in attachments
+        assert "presigned_url" in attachments["image"]
+        image_data = attachments["image"]["reader"]
+        assert image_data.read() == b"fake image data for testing"
+        return {"answer": "test image"}
+
+    def evaluator(
+        outputs: dict, reference_outputs: dict, attachments: dict
+    ) -> Dict[str, Any]:
+        assert "image" in attachments
+        assert "presigned_url" in attachments["image"]
+        image_data = attachments["image"]["reader"]
+        assert image_data.read() == b"fake image data for testing"
+        return {
+            "score": float(
+                reference_outputs.get("answer") == outputs.get("answer")  # type: ignore
+            )
+        }
+
+    results = langchain_client.evaluate(
+        target,
+        data=dataset_name,
+        evaluators=[evaluator],
+        num_repetitions=2,
+    )
+
+    assert len(results) == 2
+
+    experiment_name = results.experiment_name
+
+    time.sleep(10)
+    # Test get_experiment_results method
+    experiment_results = langchain_client.get_experiment_results(name=experiment_name)
+
+    # Test that we get stats
+    assert experiment_results["stats"] is not None
+    stats = experiment_results["stats"]
+    assert hasattr(stats, "run_count")
+    assert stats.run_count > 0
+
+    # Test that we get an examples iterator
+    examples_list = list(experiment_results["examples_with_runs"])
+    assert len(examples_list) > 0
+
+    # Test with the limit parameter
+    limited_results = langchain_client.get_experiment_results(
+        name=experiment_name, limit=1
+    )
+    limited_examples = list(limited_results["examples_with_runs"])
+    assert len(limited_examples) == 1
+
+    # Stats should be the same regardless of limit (they come from the project)
+    assert limited_results["stats"].run_count == experiment_results["stats"].run_count
+
+    # Test preview mode - should be faster and return preview data
+    preview_results = langchain_client.get_experiment_results(
+        name=experiment_name, preview=True
+    )
+    assert len(list(preview_results["examples_with_runs"])) > 0
+
+    safe_delete_dataset(langchain_client, dataset_name=dataset_name)
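
The test's fixed time.sleep(10) covers the ingestion delay that the get_experiment_results docstring warns about. A more robust variant would poll until stats are populated; a sketch using the same fixtures (not part of the commit, the helper name is illustrative):

import time

def wait_for_experiment_stats(client, experiment_name, timeout=60.0):
    """Poll until the experiment's aggregated stats report at least one run."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        results = client.get_experiment_results(name=experiment_name)
        if results["stats"].run_count:
            return results
        time.sleep(2)
    raise TimeoutError(f"experiment {experiment_name!r} has no runs after {timeout}s")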

vendor/orjson/test/test_api.py

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ def test_loads_recursion_valid_limit_mixed(self):
         loads() recursion limit at limit mixed
         """
         n = LOADS_RECURSION_LIMIT
-        value = b"[" b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
+        value = b'[{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
         pytest.raises(orjson.JSONDecodeError, orjson.loads, value)
 
     def test_loads_recursion_valid_excessive_array(self):
@@ -111,7 +111,7 @@ def test_loads_recursion_valid_limit_mixed_pretty(self):
         loads() recursion limit at limit mixed pretty
         """
         n = LOADS_RECURSION_LIMIT
-        value = b"[\n " b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
+        value = b'[\n {"key":' * n + b'{"key":true}' + b"}" * n + b"]"
         pytest.raises(orjson.JSONDecodeError, orjson.loads, value)
 
     def test_loads_recursion_valid_excessive_array_pretty(self):
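
These two changes are behavior-preserving: adjacent bytes literals are joined at compile time, before any operator applies, so the old b"[" b'{"key":' already parsed as the single literal b'[{"key":' and the repetition applied to the whole thing. The new spelling just makes that explicit. A quick illustration:

n = 3
old = b"[" b'{"key":' * n  # implicit literal concatenation happens first...
new = b'[{"key":' * n      # ...so both build b'[{"key":[{"key":[{"key":'
assert old == new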
