Commit 6c815df

fix(llma): Langchain cache token double subtraction for non-Anthropic providers (#369)
1 parent 3a1b8e4 commit 6c815df

File tree

- CHANGELOG.md
- posthog/ai/langchain/callbacks.py
- posthog/test/ai/langchain/test_callbacks.py
- posthog/version.py

4 files changed: 86 additions & 17 deletions


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+# 6.9.2 - 2025-11-10
+
+- fix(llma): fix cache token double subtraction in Langchain for non-Anthropic providers causing negative costs
+
 # 6.9.1 - 2025-11-07
 
 - fix(error-tracking): pass code variables config from init to client

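For context, here is a minimal sketch of the failure mode the changelog entry describes. The token counts are taken from the test cases further down; the downstream cost step is an assumption about PostHog's server-side pricing (it is not part of this diff), and only the SDK-side subtraction mirrors the code change below.

# Illustrative sketch only (Python). The "cost-side" split is a hypothetical stand-in
# for the server-side calculation; the SDK-side subtraction is what this commit gates.

# OpenAI-style usage: the reported input count already includes cached reads.
input_tokens = 150
cache_read_tokens = 100

# Pre-fix SDK behaviour: subtract unconditionally before capturing the event.
reported_input = max(input_tokens - cache_read_tokens, 0)   # 50

# Assumed cost-side step for OpenAI-style providers: split out cached reads again.
uncached_for_pricing = reported_input - cache_read_tokens   # 50 - 100 = -50

# A negative uncached count yields a negative input cost, which is the bug in the
# commit title. Post-fix, the SDK only subtracts for Anthropic, so reported_input
# stays 150 here and the downstream split remains non-negative.
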
posthog/ai/langchain/callbacks.py

Lines changed: 29 additions & 9 deletions
@@ -575,7 +575,7 @@ def _capture_generation(
             event_properties["$ai_is_error"] = True
         else:
             # Add usage
-            usage = _parse_usage(output)
+            usage = _parse_usage(output, run.provider, run.model)
             event_properties["$ai_input_tokens"] = usage.input_tokens
             event_properties["$ai_output_tokens"] = usage.output_tokens
             event_properties["$ai_cache_creation_input_tokens"] = (
@@ -696,6 +696,8 @@ class ModelUsage:
 
 def _parse_usage_model(
     usage: Union[BaseModel, dict],
+    provider: Optional[str] = None,
+    model: Optional[str] = None,
 ) -> ModelUsage:
     if isinstance(usage, BaseModel):
         usage = usage.__dict__
@@ -764,16 +766,30 @@ def _parse_usage_model(
             for mapped_key, dataclass_key in field_mapping.items()
         },
     )
-    # In LangChain, input_tokens is the sum of input and cache read tokens.
-    # Our cost calculation expects them to be separate, for Anthropic.
-    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+    # For Anthropic providers, LangChain reports input_tokens as the sum of input and cache read tokens.
+    # Our cost calculation expects them to be separate for Anthropic, so we subtract cache tokens.
+    # For other providers (OpenAI, etc.), input_tokens already includes cache tokens as expected.
+    # Match logic consistent with plugin-server: exact match on provider OR substring match on model
+    is_anthropic = False
+    if provider and provider.lower() == "anthropic":
+        is_anthropic = True
+    elif model and "anthropic" in model.lower():
+        is_anthropic = True
+
+    if (
+        is_anthropic
+        and normalized_usage.input_tokens
+        and normalized_usage.cache_read_tokens
+    ):
         normalized_usage.input_tokens = max(
             normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
         )
     return normalized_usage
 
 
-def _parse_usage(response: LLMResult) -> ModelUsage:
+def _parse_usage(
+    response: LLMResult, provider: Optional[str] = None, model: Optional[str] = None
+) -> ModelUsage:
     # langchain-anthropic uses the usage field
     llm_usage_keys = ["token_usage", "usage"]
     llm_usage: ModelUsage = ModelUsage(
@@ -787,21 +803,25 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
     if response.llm_output is not None:
         for key in llm_usage_keys:
             if response.llm_output.get(key):
-                llm_usage = _parse_usage_model(response.llm_output[key])
+                llm_usage = _parse_usage_model(
+                    response.llm_output[key], provider, model
+                )
                 break
 
     if hasattr(response, "generations"):
         for generation in response.generations:
             if "usage" in generation:
-                llm_usage = _parse_usage_model(generation["usage"])
+                llm_usage = _parse_usage_model(generation["usage"], provider, model)
                 break
 
             for generation_chunk in generation:
                 if generation_chunk.generation_info and (
                     "usage_metadata" in generation_chunk.generation_info
                 ):
                     llm_usage = _parse_usage_model(
-                        generation_chunk.generation_info["usage_metadata"]
+                        generation_chunk.generation_info["usage_metadata"],
+                        provider,
+                        model,
                     )
                     break
 
@@ -828,7 +848,7 @@ def _parse_usage(response: LLMResult) -> ModelUsage:
                     bedrock_anthropic_usage or bedrock_titan_usage or ollama_usage
                 )
                 if chunk_usage:
-                    llm_usage = _parse_usage_model(chunk_usage)
+                    llm_usage = _parse_usage_model(chunk_usage, provider, model)
                     break
 
     return llm_usage

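To read the new gating logic outside the diff context, here is a self-contained sketch of the normalization rule. This is not the SDK's `_parse_usage_model` (which also maps many provider-specific usage fields); the function name and signature below are illustrative, and the token counts match the tests in the next file.

from typing import Optional


def normalize_input_tokens(
    input_tokens: int,
    cache_read_tokens: int,
    provider: Optional[str] = None,
    model: Optional[str] = None,
) -> int:
    """Standalone sketch of the subtraction rule introduced in this commit."""
    # Exact match on provider OR substring match on model, as in the diff above.
    is_anthropic = bool(
        (provider and provider.lower() == "anthropic")
        or (model and "anthropic" in model.lower())
    )
    if is_anthropic and input_tokens and cache_read_tokens:
        # LangChain's Anthropic integration reports input_tokens as fresh input
        # plus cache reads, so separate them and clamp at zero.
        return max(input_tokens - cache_read_tokens, 0)
    # Other providers already report input_tokens the way the cost calculation expects.
    return input_tokens


# Anthropic: 1200 reported input containing 800 cached reads -> 400 fresh input tokens.
assert normalize_input_tokens(1200, 800, provider="anthropic") == 400
# Model-name fallback, e.g. a model id that embeds "anthropic".
assert normalize_input_tokens(1200, 800, model="anthropic.claude-3-sonnet") == 400
# OpenAI: input count is left untouched; cached reads are handled downstream.
assert normalize_input_tokens(150, 100, provider="openai") == 150
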
posthog/test/ai/langchain/test_callbacks.py

Lines changed: 52 additions & 7 deletions
@@ -1584,13 +1584,58 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 400
+    assert (
+        generation_props["$ai_input_tokens"] == 1200
+    )  # No provider metadata, no subtraction
     assert generation_props["$ai_output_tokens"] == 30
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
     assert generation_props["$ai_cache_read_input_tokens"] == 800
     assert generation_props["$ai_reasoning_tokens"] == 0
 
 
+def test_anthropic_provider_subtracts_cache_tokens(mock_client):
+    """Test that Anthropic provider correctly subtracts cache tokens from input tokens."""
+    from langchain_core.outputs import LLMResult, ChatGeneration
+    from langchain_core.messages import AIMessage
+    from uuid import uuid4
+
+    cb = CallbackHandler(mock_client)
+    run_id = uuid4()
+
+    # Set up with Anthropic provider
+    cb._set_llm_metadata(
+        serialized={},
+        run_id=run_id,
+        messages=[{"role": "user", "content": "test"}],
+        metadata={"ls_provider": "anthropic", "ls_model_name": "claude-3-sonnet"},
+    )
+
+    # Response with cache tokens: 1200 input (includes 800 cached)
+    response = LLMResult(
+        generations=[
+            [
+                ChatGeneration(
+                    message=AIMessage(content="Response"),
+                    generation_info={
+                        "usage_metadata": {
+                            "input_tokens": 1200,
+                            "output_tokens": 50,
+                            "cache_read_input_tokens": 800,
+                        }
+                    },
+                )
+            ]
+        ],
+        llm_output={},
+    )
+
+    cb._pop_run_and_capture_generation(run_id, None, response)
+
+    generation_args = mock_client.capture.call_args_list[0][1]
+    assert generation_args["properties"]["$ai_input_tokens"] == 400  # 1200 - 800
+    assert generation_args["properties"]["$ai_cache_read_input_tokens"] == 800
+
+
 def test_openai_cache_read_tokens(mock_client):
     """Test that OpenAI cache read tokens are captured correctly."""
     prompt = ChatPromptTemplate.from_messages(
@@ -1626,7 +1671,7 @@ def test_openai_cache_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 50
+    assert generation_props["$ai_input_tokens"] == 150  # No subtraction for OpenAI
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1708,7 +1753,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 200
+    assert generation_props["$ai_input_tokens"] == 500  # No subtraction for OpenAI
     assert generation_props["$ai_output_tokens"] == 100
     assert generation_props["$ai_cache_read_input_tokens"] == 300
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1917,8 +1962,8 @@ def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    # Input tokens should be reduced: 150 - 100 = 50
-    assert generation_props["$ai_input_tokens"] == 50
+    # Input tokens not reduced without provider metadata
+    assert generation_props["$ai_input_tokens"] == 150
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
 
@@ -1959,8 +2004,8 @@ def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    # Input tokens should be 0, not negative: max(80 - 100, 0) = 0
-    assert generation_props["$ai_input_tokens"] == 0
+    # Input tokens not reduced without provider metadata
+    assert generation_props["$ai_input_tokens"] == 80
     assert generation_props["$ai_output_tokens"] == 20
     assert generation_props["$ai_cache_read_input_tokens"] == 100
 

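The new test drives the handler through its private helpers; in normal use the provider and model metadata arrive automatically from LangChain when a chat model runs with the callback attached. A hedged end-to-end sketch follows (package versions, the model id, and the placeholder keys are assumptions, not part of this diff):

# Usage sketch, assuming posthog >= 6.9.2 and langchain-anthropic are installed.
from posthog import Posthog
from posthog.ai.langchain import CallbackHandler
from langchain_anthropic import ChatAnthropic

posthog_client = Posthog("<project_api_key>", host="<posthog_host>")
callback = CallbackHandler(posthog_client)

# LangChain attaches ls_provider / ls_model_name metadata to the run; with this
# commit, _parse_usage uses that metadata so cache read tokens are only subtracted
# from input tokens for Anthropic, not for other providers.
llm = ChatAnthropic(model="claude-3-5-sonnet-latest")
llm.invoke("Hello", config={"callbacks": [callback]})
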
posthog/version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-VERSION = "6.9.1"
+VERSION = "6.9.2"
 
 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201
