Commit 7df12ee

feat(py): new get_experiment_results() endpoint (#2021)
get_experiment_results() and test case
1 parent 7b9f5a5 commit 7df12ee

File tree

python/langsmith/client.py
python/langsmith/evaluation/_arunner.py
python/langsmith/evaluation/_runner.py
python/langsmith/schemas.py
python/tests/integration_tests/test_client.py
vendor/orjson/test/test_api.py

6 files changed: +235 −7 lines changed

python/langsmith/client.py

Lines changed: 137 additions & 1 deletion
@@ -92,7 +92,7 @@
     serialized_run_operation_to_multipart_parts_and_context,
 )
 from langsmith._internal._serde import dumps_json as _dumps_json
-from langsmith.schemas import AttachmentInfo
+from langsmith.schemas import AttachmentInfo, ExampleWithRuns
 
 
 def _check_otel_enabled() -> bool:
@@ -8269,6 +8269,142 @@ async def helpfulness(outputs: dict) -> dict:
             **kwargs,
         )
 
+    def _paginate_examples_with_runs(
+        self,
+        dataset_id: ID_TYPE,
+        session_id: uuid.UUID,
+        preview: bool = False,
+        comparative_experiment_id: Optional[uuid.UUID] = None,
+        filters: dict[uuid.UUID, list[str]] | None = None,
+        limit: Optional[int] = None,
+    ) -> Iterator[list[ExampleWithRuns]]:
+        """Paginate through examples with runs and yield batches.
+
+        Args:
+            dataset_id: Dataset UUID to fetch examples with runs for.
+            session_id: Session UUID to filter runs by (same as project_id).
+            preview: Whether to return preview data only.
+            comparative_experiment_id: Optional comparative experiment UUID.
+            filters: Optional filters to apply.
+            limit: Maximum total number of results to return.
+
+        Yields:
+            Batches of run results as lists of ExampleWithRuns instances.
+        """
+        offset = 0
+        results_count = 0
+
+        while True:
+            remaining = (limit - results_count) if limit else None
+            batch_limit = min(100, remaining) if remaining else 100
+
+            body = {
+                "session_ids": [session_id],
+                "offset": offset,
+                "limit": batch_limit,
+                "preview": preview,
+                "comparative_experiment_id": comparative_experiment_id,
+                "filters": filters,
+            }
+
+            response = self.request_with_retries(
+                "POST",
+                f"/datasets/{dataset_id}/runs",
+                request_kwargs={"data": _dumps_json(body)},
+            )
+
+            batch = response.json()
+            if not batch:
+                break
+
+            # Transform raw dictionaries into ExampleWithRuns instances
+            examples_batch = [ls_schemas.ExampleWithRuns(**result) for result in batch]
+            yield examples_batch
+            results_count += len(batch)
+
+            if len(batch) < batch_limit or (limit and results_count >= limit):
+                break
+
+            offset += len(batch)
+
+    def get_experiment_results(
+        self,
+        name: Optional[str] = None,
+        project_id: Optional[uuid.UUID] = None,
+        preview: bool = False,
+        comparative_experiment_id: Optional[uuid.UUID] = None,
+        filters: dict[uuid.UUID, list[str]] | None = None,
+        limit: Optional[int] = None,
+    ) -> ls_schemas.ExperimentResults:
+        """Get results for an experiment, including aggregated session stats and the experiment runs for each dataset example.
+
+        Experiment results may not be available immediately after the experiment is created.
+
+        Args:
+            name: The experiment name.
+            project_id: The experiment's tracing project ID (also called
+                session_id); it can be found in the URL of the LangSmith
+                experiment page.
+            preview: Whether to return lightweight preview data only. When True,
+                fetches inputs_preview/outputs_preview summaries instead of full
+                inputs/outputs from S3 storage. Faster and uses less bandwidth.
+            comparative_experiment_id: Optional comparative experiment UUID for
+                pairwise comparison experiment results.
+            filters: Optional filters to apply to the results.
+            limit: Maximum number of results to return.
+
+        Returns:
+            ExperimentResults containing stats (a TracerSessionResult) and an
+            iterator of examples_with_runs (ExampleWithRuns instances).
+
+        Raises:
+            ValueError: If no project is found for the given name or project_id.
+
+        Example:
+            >>> client = Client()
+            >>> results = client.get_experiment_results(
+            ...     project_id="037ae90f-f297-4926-b93c-37d8abf6899f",
+            ... )
+            >>> for example_with_runs in results["examples_with_runs"]:
+            ...     print(example_with_runs.dict())
+
+            >>> # Access aggregated experiment stats
+            >>> print(f"Total runs: {results['stats'].run_count}")
+            >>> print(f"Total cost: {results['stats'].total_cost}")
+            >>> print(f"P50 latency: {results['stats'].latency_p50}")
+
+        """
+        if name and not project_id:
+            projects = list(self.list_projects(name=name))
+            if not projects:
+                raise ValueError(f"No experiment found with name: '{name}'")
+            project_id = projects[0].id
+
+        # Get aggregated stats for the experiment project/session
+        project_stats = list(
+            self.list_projects(
+                project_ids=[cast(uuid.UUID, project_id)], include_stats=True
+            )
+        )
+
+        if not project_stats:
+            raise ValueError(f"No experiment found with project_id: '{project_id}'")
+
+        dataset_id = project_stats[0].reference_dataset_id
+
+        def _get_examples_with_runs_iterator():
+            """Yield examples with their corresponding experiment runs."""
+            for batch in self._paginate_examples_with_runs(
+                dataset_id=dataset_id,
+                session_id=project_id,
+                preview=preview,
+                comparative_experiment_id=comparative_experiment_id,
+                filters=filters,
+                limit=limit,
+            ):
+                yield from batch
+
+        return ls_schemas.ExperimentResults(
+            stats=project_stats[0],
+            examples_with_runs=_get_examples_with_runs_iterator(),
+        )
+
 
 def convert_prompt_to_openai_format(
     messages: Any,
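
For orientation, here is a minimal usage sketch of the new endpoint, distilled from the docstring above; the experiment name is hypothetical, and (as the docstring notes) results may take a moment to become available after an experiment finishes:

from langsmith import Client

client = Client()

# Resolve the experiment by name (internally mapped to its tracing project).
results = client.get_experiment_results(name="my-experiment", limit=10)

# Aggregated stats arrive eagerly as a TracerSessionResult.
print(f"runs: {results['stats'].run_count}")

# Examples stream lazily, fetched in batches of up to 100 per request;
# each item is an ExampleWithRuns carrying its experiment runs.
for example in results["examples_with_runs"]:
    print(example.id, len(example.runs))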

python/langsmith/evaluation/_arunner.py

Lines changed: 0 additions & 2 deletions
@@ -648,7 +648,6 @@ def _reset_example_attachments(self, example: schemas.Example) -> schemas.Example:
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,
@@ -767,7 +766,6 @@ def _get_example_with_readers(self, example: schemas.Example) -> schemas.Example:
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,

python/langsmith/evaluation/_runner.py

Lines changed: 0 additions & 1 deletion
@@ -1391,7 +1391,6 @@ def _reset_example_attachment_readers(
         outputs=example.outputs,
         metadata=example.metadata,
         modified_at=example.modified_at,
-        runs=example.runs,
         source_run_id=example.source_run_id,
         attachments=new_attachments,
         _host_url=example._host_url,

python/langsmith/schemas.py

Lines changed: 16 additions & 1 deletion
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
 from datetime import datetime, timedelta, timezone
 from decimal import Decimal
 from enum import Enum
@@ -158,7 +159,6 @@ class Example(ExampleBase):
     )
     dataset_id: UUID = Field(default=UUID("00000000-0000-0000-0000-000000000000"))
     modified_at: Optional[datetime] = Field(default=None)
-    runs: list[Run] = Field(default_factory=list)
     source_run_id: Optional[UUID] = None
     attachments: Optional[dict[str, AttachmentInfo]] = Field(default=None)
     """Dictionary with attachment names as keys and a tuple of the S3 url
@@ -1261,3 +1261,18 @@ class UpsertExamplesResponse(TypedDict):
     """The number of examples that were upserted."""
     example_ids: list[str]
     """The ids of the examples that were upserted."""
+
+
+class ExampleWithRuns(Example):
+    """Example with runs."""
+
+    runs: list[Run] = Field(default_factory=list)
+    """The runs of the example."""
+
+
+class ExperimentResults(TypedDict):
+    """Results container for experiment data with stats and examples."""
+
+    stats: TracerSessionResult
+    examples_with_runs: Iterator[ExampleWithRuns]
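
A practical consequence of typing examples_with_runs as an Iterator rather than a list is that it is lazy and single-pass: iterating it drives the paginated POST requests shown in client.py above, and it can only be consumed once. A brief sketch of the implication (the experiment name is hypothetical; Run's error field is used for illustration):

results = client.get_experiment_results(name="my-experiment")

# Materialize once if you need multiple passes over the examples.
examples = list(results["examples_with_runs"])
with_errors = [ex for ex in examples if any(r.error for r in ex.runs)]

# The underlying generator is now exhausted; iterating again yields nothing.
assert list(results["examples_with_runs"]) == []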

python/tests/integration_tests/test_client.py

Lines changed: 80 additions & 0 deletions
@@ -3567,3 +3567,83 @@ def export_batch(self, run_ops, otel_context_map):
         readable_span.attributes[_otel_exporter.GENAI_COMPLETION]
         == '{"answer":"Hello, User!"}'
     )
+
+
+def test_get_experiment_results(langchain_client: Client) -> None:
+    """Test get_experiment_results method with evaluation data."""
+    dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
+    dataset = _create_dataset(langchain_client, dataset_name)
+
+    # Create example with attachments
+    example = ExampleCreate(
+        inputs={"question": "What is shown in the image?"},
+        outputs={"answer": "test image"},
+        attachments={
+            "image": ("image/png", b"fake image data for testing"),
+        },
+    )
+
+    langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])
+
+    def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
+        # Verify we receive the attachment data
+        assert "image" in attachments
+        assert "presigned_url" in attachments["image"]
+        image_data = attachments["image"]["reader"]
+        assert image_data.read() == b"fake image data for testing"
+        return {"answer": "test image"}
+
+    def evaluator(
+        outputs: dict, reference_outputs: dict, attachments: dict
+    ) -> Dict[str, Any]:
+        assert "image" in attachments
+        assert "presigned_url" in attachments["image"]
+        image_data = attachments["image"]["reader"]
+        assert image_data.read() == b"fake image data for testing"
+        return {
+            "score": float(
+                reference_outputs.get("answer") == outputs.get("answer")  # type: ignore
+            )
+        }
+
+    results = langchain_client.evaluate(
+        target,
+        data=dataset_name,
+        evaluators=[evaluator],
+        num_repetitions=2,
+    )
+
+    assert len(results) == 2
+
+    experiment_name = results.experiment_name
+
+    time.sleep(10)
+    # Test get_experiment_results method
+    experiment_results = langchain_client.get_experiment_results(name=experiment_name)
+
+    # Test that we get stats
+    assert experiment_results["stats"] is not None
+    stats = experiment_results["stats"]
+    assert hasattr(stats, "run_count")
+    assert stats.run_count > 0
+
+    # Test that we get an examples iterator
+    examples_list = list(experiment_results["examples_with_runs"])
+    assert len(examples_list) > 0
+
+    # Test with the limit parameter
+    limited_results = langchain_client.get_experiment_results(
+        name=experiment_name, limit=1
+    )
+    limited_examples = list(limited_results["examples_with_runs"])
+    assert len(limited_examples) == 1
+
+    # Stats should be the same regardless of limit (they come from the project)
+    assert limited_results["stats"].run_count == experiment_results["stats"].run_count
+
+    # Test preview mode - should be faster and return preview data
+    preview_results = langchain_client.get_experiment_results(
+        name=experiment_name, preview=True
+    )
+    assert len(list(preview_results["examples_with_runs"])) > 0
+
+    safe_delete_dataset(langchain_client, dataset_name=dataset_name)
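
The test's fixed time.sleep(10) covers the ingestion delay that the get_experiment_results docstring warns about. A more robust variant would poll until stats are populated; a sketch using the same fixtures (not part of the commit, the helper name is illustrative):

import time

def wait_for_experiment_stats(client, experiment_name, timeout=60.0):
    """Poll until the experiment's aggregated stats report at least one run."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        results = client.get_experiment_results(name=experiment_name)
        if results["stats"].run_count:
            return results
        time.sleep(2)
    raise TimeoutError(f"experiment {experiment_name!r} has no runs after {timeout}s")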

vendor/orjson/test/test_api.py

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ def test_loads_recursion_valid_limit_mixed(self):
         loads() recursion limit at limit mixed
         """
         n = LOADS_RECURSION_LIMIT
-        value = b"[" b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
+        value = b'[{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
         pytest.raises(orjson.JSONDecodeError, orjson.loads, value)
 
     def test_loads_recursion_valid_excessive_array(self):
@@ -111,7 +111,7 @@ def test_loads_recursion_valid_limit_mixed_pretty(self):
         loads() recursion limit at limit mixed pretty
         """
         n = LOADS_RECURSION_LIMIT
-        value = b"[\n " b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
+        value = b'[\n {"key":' * n + b'{"key":true}' + b"}" * n + b"]"
         pytest.raises(orjson.JSONDecodeError, orjson.loads, value)
 
     def test_loads_recursion_valid_excessive_array_pretty(self):
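
These two changes are behavior-preserving: adjacent bytes literals are joined at compile time, before any operator applies, so the old b"[" b'{"key":' already parsed as the single literal b'[{"key":' and the repetition applied to the whole thing. The new spelling just makes that explicit. A quick illustration:

n = 3
old = b"[" b'{"key":' * n  # implicit literal concatenation happens first...
new = b'[{"key":' * n      # ...so both build b'[{"key":[{"key":[{"key":'
assert old == new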
