4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
# 6.7.10 - 2025-10-24

- fix(llma): cache cost calculation in the LangChain callback

# 6.7.9 - 2025-10-22

- fix(flags): multi-condition flags with static cohorts returning wrong variants
8 changes: 7 additions & 1 deletion posthog/ai/langchain/callbacks.py
@@ -750,12 +750,18 @@ def _parse_usage_model(
"cache_read": "cache_read_tokens",
"reasoning": "reasoning_tokens",
}
-    return ModelUsage(
+    normalized_usage = ModelUsage(
**{
dataclass_key: parsed_usage.get(mapped_key) or 0
for mapped_key, dataclass_key in field_mapping.items()
},
)
+    # input_tokens is the sum of input and cache read tokens.
+    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+        normalized_usage.input_tokens = max(
+            normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0
+        )
+    return normalized_usage


def _parse_usage(response: LLMResult) -> ModelUsage:
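
For reference, here is a minimal standalone sketch of the normalization the new callback code performs, assuming (as the PR's inline comment states) that the reported input_tokens already include cache-read tokens. The ModelUsage dataclass and normalize helper below are simplified stand-ins for illustration, not the SDK's actual definitions.

from dataclasses import dataclass


@dataclass
class ModelUsage:
    # Simplified stand-in for the SDK's usage dataclass.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0


def normalize(usage: ModelUsage) -> ModelUsage:
    # Assumes input_tokens already counts cache reads; subtract them so cached
    # tokens are not double-counted in the cost calculation, clamping at zero
    # for responses whose accounting differs.
    if usage.input_tokens and usage.cache_read_tokens:
        usage.input_tokens = max(usage.input_tokens - usage.cache_read_tokens, 0)
    return usage


# Mirrors the scenarios covered by the new tests below.
assert normalize(ModelUsage(input_tokens=1200, cache_read_tokens=800)).input_tokens == 400
assert normalize(ModelUsage(input_tokens=80, cache_read_tokens=100)).input_tokens == 0
assert normalize(ModelUsage(input_tokens=100)).input_tokens == 100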
216 changes: 211 additions & 5 deletions posthog/test/ai/langchain/test_callbacks.py
@@ -1564,9 +1564,9 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
AIMessage(
content="Using cached analysis to provide quick response.",
usage_metadata={
"input_tokens": 200,
"input_tokens": 1200,
"output_tokens": 30,
"total_tokens": 1030,
"total_tokens": 1230,
"cache_read_input_tokens": 800, # Anthropic cache read
},
)
@@ -1583,7 +1583,7 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 200
assert generation_props["$ai_input_tokens"] == 400
assert generation_props["$ai_output_tokens"] == 30
assert generation_props["$ai_cache_creation_input_tokens"] == 0
assert generation_props["$ai_cache_read_input_tokens"] == 800
@@ -1625,7 +1625,7 @@ def test_openai_cache_read_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 150
assert generation_props["$ai_input_tokens"] == 50
assert generation_props["$ai_output_tokens"] == 40
assert generation_props["$ai_cache_read_input_tokens"] == 100
assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1707,7 +1707,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
assert generation_props["$ai_input_tokens"] == 500
assert generation_props["$ai_input_tokens"] == 200
assert generation_props["$ai_output_tokens"] == 100
assert generation_props["$ai_cache_read_input_tokens"] == 300
assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1876,3 +1876,209 @@ def test_tool_definition(mock_client):
assert props["$ai_latency"] == 1.0
# Verify that tools are captured in the $ai_tools property
assert props["$ai_tools"] == tools


def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
"""Test that cache_read_tokens are properly subtracted from input_tokens.

This tests the logic in callbacks.py lines 757-758:
if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
normalized_usage.input_tokens = max(normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0)
"""
prompt = ChatPromptTemplate.from_messages(
[("user", "Use the cached prompt for this request")]
)

# Scenario 1: input_tokens includes cache_read_tokens (typical case)
# input_tokens=150 includes 100 cache_read tokens, so actual input is 50
model = FakeMessagesListChatModel(
responses=[
AIMessage(
content="Response using cached prompt context.",
usage_metadata={
"input_tokens": 150, # Total includes cache reads
"output_tokens": 40,
"total_tokens": 190,
"cache_read_input_tokens": 100, # 100 tokens read from cache
},
)
]
)

callbacks = [CallbackHandler(mock_client)]
chain = prompt | model
result = chain.invoke({}, config={"callbacks": callbacks})

assert result.content == "Response using cached prompt context."
assert mock_client.capture.call_count == 3

generation_args = mock_client.capture.call_args_list[1][1]
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should be reduced: 150 - 100 = 50
assert generation_props["$ai_input_tokens"] == 50
assert generation_props["$ai_output_tokens"] == 40
assert generation_props["$ai_cache_read_input_tokens"] == 100


def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
"""Test that cache_read_tokens subtraction doesn't result in negative input_tokens.

This tests the max(..., 0) part of the logic in callbacks.py lines 757-758.
"""
prompt = ChatPromptTemplate.from_messages(
[("user", "Edge case with large cache read")]
)

# Edge case: cache_read_tokens >= input_tokens
# This could happen in some API responses where accounting differs
model = FakeMessagesListChatModel(
responses=[
AIMessage(
content="Response with edge case token counts.",
usage_metadata={
"input_tokens": 80,
"output_tokens": 20,
"total_tokens": 100,
"cache_read_input_tokens": 100, # More than input_tokens
},
)
]
)

callbacks = [CallbackHandler(mock_client)]
chain = prompt | model
result = chain.invoke({}, config={"callbacks": callbacks})

assert result.content == "Response with edge case token counts."
assert mock_client.capture.call_count == 3

generation_args = mock_client.capture.call_args_list[1][1]
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should be 0, not negative: max(80 - 100, 0) = 0
assert generation_props["$ai_input_tokens"] == 0
assert generation_props["$ai_output_tokens"] == 20
assert generation_props["$ai_cache_read_input_tokens"] == 100


def test_no_cache_read_tokens_no_subtraction(mock_client):
"""Test that when there are no cache_read_tokens, input_tokens remain unchanged.

This tests the conditional check before the subtraction in callbacks.py line 757.
"""
prompt = ChatPromptTemplate.from_messages(
[("user", "Normal request without cache")]
)

# No cache usage - input_tokens should remain as-is
model = FakeMessagesListChatModel(
responses=[
AIMessage(
content="Response without cache.",
usage_metadata={
"input_tokens": 100,
"output_tokens": 30,
"total_tokens": 130,
# No cache_read_input_tokens
},
)
]
)

callbacks = [CallbackHandler(mock_client)]
chain = prompt | model
result = chain.invoke({}, config={"callbacks": callbacks})

assert result.content == "Response without cache."
assert mock_client.capture.call_count == 3

generation_args = mock_client.capture.call_args_list[1][1]
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should remain unchanged at 100
assert generation_props["$ai_input_tokens"] == 100
assert generation_props["$ai_output_tokens"] == 30
assert generation_props["$ai_cache_read_input_tokens"] == 0


def test_zero_input_tokens_with_cache_read(mock_client):
"""Test edge case where input_tokens is 0 but cache_read_tokens exist.

This tests the falsy check in the conditional (line 757).
"""
prompt = ChatPromptTemplate.from_messages([("user", "Edge case query")])

# Edge case: input_tokens is 0 (falsy), should skip subtraction
model = FakeMessagesListChatModel(
responses=[
AIMessage(
content="Response.",
usage_metadata={
"input_tokens": 0,
"output_tokens": 10,
"total_tokens": 10,
"cache_read_input_tokens": 50,
},
)
]
)

callbacks = [CallbackHandler(mock_client)]
chain = prompt | model
result = chain.invoke({}, config={"callbacks": callbacks})

assert result.content == "Response."
assert mock_client.capture.call_count == 3

generation_args = mock_client.capture.call_args_list[1][1]
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should remain 0 (no subtraction because input_tokens is falsy)
assert generation_props["$ai_input_tokens"] == 0
assert generation_props["$ai_output_tokens"] == 10
assert generation_props["$ai_cache_read_input_tokens"] == 50


def test_cache_write_tokens_not_subtracted_from_input(mock_client):
"""Test that cache_creation_input_tokens (cache write) do NOT affect input_tokens.

Only cache_read_tokens should be subtracted from input_tokens, not cache_write_tokens.
"""
prompt = ChatPromptTemplate.from_messages([("user", "Create cache")])

# Cache creation without cache read
model = FakeMessagesListChatModel(
responses=[
AIMessage(
content="Creating cache.",
usage_metadata={
"input_tokens": 1000,
"output_tokens": 20,
"total_tokens": 1020,
"cache_creation_input_tokens": 800, # Cache write, not read
},
)
]
)

callbacks = [CallbackHandler(mock_client)]
chain = prompt | model
result = chain.invoke({}, config={"callbacks": callbacks})

assert result.content == "Creating cache."
assert mock_client.capture.call_count == 3

generation_args = mock_client.capture.call_args_list[1][1]
generation_props = generation_args["properties"]

assert generation_args["event"] == "$ai_generation"
# Input tokens should NOT be reduced by cache_creation_input_tokens
assert generation_props["$ai_input_tokens"] == 1000
assert generation_props["$ai_output_tokens"] == 20
assert generation_props["$ai_cache_creation_input_tokens"] == 800
assert generation_props["$ai_cache_read_input_tokens"] == 0
2 changes: 1 addition & 1 deletion posthog/version.py
@@ -1,4 +1,4 @@
VERSION = "6.7.9"
VERSION = "6.7.10"

if __name__ == "__main__":
print(VERSION, end="") # noqa: T201