
Commit 9f37067

feat(llma): redact base64 images (#318)
1 parent 6e00d57 commit 9f37067

11 files changed: +622 -14 lines

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -1,3 +1,7 @@
+# 6.7.1 - 2025-09-01
+
+- fix: Add base64 inline image sanitization
+
 # 6.7.0 - 2025-08-26

 - feat: Add support for feature flag dependencies

posthog/ai/anthropic/anthropic.py

Lines changed: 2 additions & 1 deletion

@@ -16,6 +16,7 @@
     merge_system_prompt,
     with_privacy_mode,
 )
+from posthog.ai.sanitization import sanitize_anthropic
 from posthog.client import Client as PostHogClient
 from posthog import setup

@@ -184,7 +185,7 @@ def _capture_streaming_event(
         "$ai_input": with_privacy_mode(
             self._client._ph_client,
             posthog_privacy_mode,
-            merge_system_prompt(kwargs, "anthropic"),
+            sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,
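For context, a minimal sketch of what the newly applied sanitize_anthropic call does to an Anthropic-style image block before the event is captured. The payload is fabricated for illustration and assumes a posthog install that ships this commit (6.7.1 per the changelog):

from posthog.ai.sanitization import sanitize_anthropic

# An Anthropic vision message carrying an inline base64 image (fabricated data).
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB",  # stands in for a long base64 blob
                },
            },
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

sanitized = sanitize_anthropic(messages)
# The base64 payload is replaced; roles, text parts, and media_type are untouched.
assert sanitized[0]["content"][0]["source"]["data"] == "[base64 image redacted]"
assert sanitized[0]["content"][0]["source"]["media_type"] == "image/png"
assert sanitized[0]["content"][1]["text"] == "What is in this image?"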

posthog/ai/anthropic/anthropic_async.py

Lines changed: 2 additions & 1 deletion

@@ -17,6 +17,7 @@
     merge_system_prompt,
     with_privacy_mode,
 )
+from posthog.ai.sanitization import sanitize_anthropic
 from posthog.client import Client as PostHogClient


@@ -184,7 +185,7 @@ async def _capture_streaming_event(
         "$ai_input": with_privacy_mode(
             self._client._ph_client,
             posthog_privacy_mode,
-            merge_system_prompt(kwargs, "anthropic"),
+            sanitize_anthropic(merge_system_prompt(kwargs, "anthropic")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,
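The async wrapper wires in the same sanitizer. As a quick sanity check on the common case, text-only prompts come back structurally unchanged (fabricated payload, same assumptions as above):

from posthog.ai.sanitization import sanitize_anthropic

# Plain string content is passed through untouched by process_messages.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize our Q3 numbers."},
]
assert sanitize_anthropic(messages) == messages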

posthog/ai/gemini/gemini.py

Lines changed: 2 additions & 1 deletion

@@ -16,6 +16,7 @@
     get_model_params,
     with_privacy_mode,
 )
+from posthog.ai.sanitization import sanitize_gemini
 from posthog.client import Client as PostHogClient


@@ -347,7 +348,7 @@ def _capture_streaming_event(
         "$ai_input": with_privacy_mode(
             self._ph_client,
             privacy_mode,
-            self._format_input(contents),
+            sanitize_gemini(self._format_input(contents)),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._ph_client,
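The Gemini wrapper sanitizes whatever self._format_input(contents) returns; that formatting step is not part of this diff, so the sketch below applies sanitize_gemini directly to Gemini-style contents (fabricated payload):

from posthog.ai.sanitization import sanitize_gemini

# Gemini contents with an inline_data part carrying base64 bytes (fabricated).
contents = [
    {
        "role": "user",
        "parts": [
            {"text": "Describe this picture."},
            {"inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB"}},
        ],
    }
]

sanitized = sanitize_gemini(contents)
# Only the inline_data payload is replaced; text parts and mime_type survive.
assert sanitized[0]["parts"][0] == {"text": "Describe this picture."}
assert sanitized[0]["parts"][1]["inline_data"]["data"] == "[base64 image redacted]"
assert sanitized[0]["parts"][1]["inline_data"]["mime_type"] == "image/png"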

posthog/ai/langchain/callbacks.py

Lines changed: 3 additions & 2 deletions

@@ -37,6 +37,7 @@

 from posthog import setup
 from posthog.ai.utils import get_model_params, with_privacy_mode
+from posthog.ai.sanitization import sanitize_langchain
 from posthog.client import Client

 log = logging.getLogger("posthog")

@@ -480,7 +481,7 @@ def _capture_trace_or_span(
         event_properties = {
             "$ai_trace_id": trace_id,
             "$ai_input_state": with_privacy_mode(
-                self._ph_client, self._privacy_mode, run.input
+                self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
             ),
             "$ai_latency": run.latency,
             "$ai_span_name": run.name,

@@ -550,7 +551,7 @@ def _capture_generation(
             "$ai_model": run.model,
             "$ai_model_parameters": run.model_params,
             "$ai_input": with_privacy_mode(
-                self._ph_client, self._privacy_mode, run.input
+                self._ph_client, self._privacy_mode, sanitize_langchain(run.input)
             ),
             "$ai_http_status": 200,
             "$ai_latency": run.latency,

posthog/ai/openai/openai.py

Lines changed: 10 additions & 3 deletions

@@ -15,6 +15,7 @@
     get_model_params,
     with_privacy_mode,
 )
+from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
 from posthog.client import Client as PostHogClient
 from posthog import setup

@@ -194,7 +195,9 @@ def _capture_streaming_event(
         "$ai_model": kwargs.get("model"),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai_response(kwargs.get("input")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,

@@ -427,7 +430,9 @@ def _capture_streaming_event(
         "$ai_model": kwargs.get("model"),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai(kwargs.get("messages")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,

@@ -518,7 +523,9 @@ def create(
         "$ai_provider": "openai",
         "$ai_model": kwargs.get("model"),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai_response(kwargs.get("input")),
         ),
         "$ai_http_status": 200,
         "$ai_input_tokens": usage_stats.get("prompt_tokens", 0),

posthog/ai/openai/openai_async.py

Lines changed: 10 additions & 3 deletions

@@ -16,6 +16,7 @@
     get_model_params,
     with_privacy_mode,
 )
+from posthog.ai.sanitization import sanitize_openai, sanitize_openai_response
 from posthog.client import Client as PostHogClient


@@ -195,7 +196,9 @@ async def _capture_streaming_event(
         "$ai_model": kwargs.get("model"),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai_response(kwargs.get("input")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,

@@ -431,7 +434,9 @@ async def _capture_streaming_event(
         "$ai_model": kwargs.get("model"),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("messages")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai(kwargs.get("messages")),
         ),
         "$ai_output_choices": with_privacy_mode(
             self._client._ph_client,

@@ -522,7 +527,9 @@ async def create(
         "$ai_provider": "openai",
         "$ai_model": kwargs.get("model"),
         "$ai_input": with_privacy_mode(
-            self._client._ph_client, posthog_privacy_mode, kwargs.get("input")
+            self._client._ph_client,
+            posthog_privacy_mode,
+            sanitize_openai_response(kwargs.get("input")),
         ),
         "$ai_http_status": 200,
         "$ai_input_tokens": usage_stats.get("prompt_tokens", 0),

posthog/ai/sanitization.py

Lines changed: 226 additions & 0 deletions (new file)

import re
from typing import Any
from urllib.parse import urlparse

REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"


def is_base64_data_url(text: str) -> bool:
    return re.match(r"^data:([^;]+);base64,", text) is not None


def is_valid_url(text: str) -> bool:
    try:
        result = urlparse(text)
        return bool(result.scheme and result.netloc)
    except Exception:
        pass

    return text.startswith(("/", "./", "../"))


def is_raw_base64(text: str) -> bool:
    if is_valid_url(text):
        return False

    return len(text) > 20 and re.match(r"^[A-Za-z0-9+/]+=*$", text) is not None


def redact_base64_data_url(value: Any) -> Any:
    if not isinstance(value, str):
        return value

    if is_base64_data_url(value):
        return REDACTED_IMAGE_PLACEHOLDER

    if is_raw_base64(value):
        return REDACTED_IMAGE_PLACEHOLDER

    return value


def process_messages(messages: Any, transform_content_func) -> Any:
    if not messages:
        return messages

    def process_content(content: Any) -> Any:
        if isinstance(content, str):
            return content

        if not content:
            return content

        if isinstance(content, list):
            return [transform_content_func(item) for item in content]

        return transform_content_func(content)

    def process_message(msg: Any) -> Any:
        if not isinstance(msg, dict) or "content" not in msg:
            return msg
        return {**msg, "content": process_content(msg["content"])}

    if isinstance(messages, list):
        return [process_message(msg) for msg in messages]

    return process_message(messages)


def sanitize_openai_image(item: Any) -> Any:
    if not isinstance(item, dict):
        return item

    if (
        item.get("type") == "image_url"
        and isinstance(item.get("image_url"), dict)
        and "url" in item["image_url"]
    ):
        return {
            **item,
            "image_url": {
                **item["image_url"],
                "url": redact_base64_data_url(item["image_url"]["url"]),
            },
        }

    return item


def sanitize_openai_response_image(item: Any) -> Any:
    if not isinstance(item, dict):
        return item

    if item.get("type") == "input_image" and "image_url" in item:
        return {
            **item,
            "image_url": redact_base64_data_url(item["image_url"]),
        }

    return item


def sanitize_anthropic_image(item: Any) -> Any:
    if not isinstance(item, dict):
        return item

    if (
        item.get("type") == "image"
        and isinstance(item.get("source"), dict)
        and item["source"].get("type") == "base64"
        and "data" in item["source"]
    ):
        # For Anthropic, if the source type is "base64", we should always redact the data
        # The provider is explicitly telling us this is base64 data
        return {
            **item,
            "source": {
                **item["source"],
                "data": REDACTED_IMAGE_PLACEHOLDER,
            },
        }

    return item


def sanitize_gemini_part(part: Any) -> Any:
    if not isinstance(part, dict):
        return part

    if (
        "inline_data" in part
        and isinstance(part["inline_data"], dict)
        and "data" in part["inline_data"]
    ):
        # For Gemini, the inline_data structure indicates base64 data
        # We should redact any string data in this context
        return {
            **part,
            "inline_data": {
                **part["inline_data"],
                "data": REDACTED_IMAGE_PLACEHOLDER,
            },
        }

    return part


def process_gemini_item(item: Any) -> Any:
    if not isinstance(item, dict):
        return item

    if "parts" in item and item["parts"]:
        parts = item["parts"]
        if isinstance(parts, list):
            parts = [sanitize_gemini_part(part) for part in parts]
        else:
            parts = sanitize_gemini_part(parts)

        return {**item, "parts": parts}

    return item


def sanitize_langchain_image(item: Any) -> Any:
    if not isinstance(item, dict):
        return item

    if (
        item.get("type") == "image_url"
        and isinstance(item.get("image_url"), dict)
        and "url" in item["image_url"]
    ):
        return {
            **item,
            "image_url": {
                **item["image_url"],
                "url": redact_base64_data_url(item["image_url"]["url"]),
            },
        }

    if item.get("type") == "image" and "data" in item:
        return {**item, "data": redact_base64_data_url(item["data"])}

    if (
        item.get("type") == "image"
        and isinstance(item.get("source"), dict)
        and "data" in item["source"]
    ):
        # Anthropic style - raw base64 in structured format, always redact
        return {
            **item,
            "source": {
                **item["source"],
                "data": REDACTED_IMAGE_PLACEHOLDER,
            },
        }

    if item.get("type") == "media" and "data" in item:
        return {**item, "data": redact_base64_data_url(item["data"])}

    return item


def sanitize_openai(data: Any) -> Any:
    return process_messages(data, sanitize_openai_image)


def sanitize_openai_response(data: Any) -> Any:
    return process_messages(data, sanitize_openai_response_image)


def sanitize_anthropic(data: Any) -> Any:
    return process_messages(data, sanitize_anthropic_image)


def sanitize_gemini(data: Any) -> Any:
    if not data:
        return data

    if isinstance(data, list):
        return [process_gemini_item(item) for item in data]

    return process_gemini_item(data)


def sanitize_langchain(data: Any) -> Any:
    return process_messages(data, sanitize_langchain_image)
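Finally, a quick sketch of how the low-level helpers classify values, using fabricated strings; the expected results follow directly from the functions above:

from posthog.ai.sanitization import (
    REDACTED_IMAGE_PLACEHOLDER,
    is_base64_data_url,
    is_raw_base64,
    redact_base64_data_url,
)

# Data URLs are detected by their prefix alone.
assert is_base64_data_url("data:image/png;base64,iVBORw0KGgo=")

# Bare base64: longer than 20 chars, base64 alphabet only, and not URL-shaped.
assert is_raw_base64("iVBORw0KGgoAAAANSUhEUgAAAAUA")
assert not is_raw_base64("https://example.com/cat.png")
assert not is_raw_base64("short")

# redact_base64_data_url only touches strings that look like image payloads.
assert redact_base64_data_url("data:image/png;base64,iVBORw0KGgo=") == REDACTED_IMAGE_PLACEHOLDER
assert redact_base64_data_url("iVBORw0KGgoAAAANSUhEUgAAAAUA") == REDACTED_IMAGE_PLACEHOLDER
assert redact_base64_data_url("Look at https://example.com/cat.png") == "Look at https://example.com/cat.png"
assert redact_base64_data_url(42) == 42  # non-strings pass through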
